From 3f6bc5c68cbaf17559cc3dd09bd712e67851037a Mon Sep 17 00:00:00 2001 From: Kirill Smelkov Date: Thu, 30 Mar 2023 09:44:47 +0300 Subject: [PATCH 01/29] golang_str: fix UCS2 builds + ./trun python -m pytest -vvsx golang/golang_str_test.py ==================================== test session starts ===================================== platform linux2 -- Python 2.7.18, pytest-4.6.11, py-1.11.0, pluggy-0.13.1 -- /home/kirr/src/tools/go/py2d.venv2023/bin/python cachedir: .pytest_cache rootdir: /home/kirr/src/tools/go/pygolang-xgpystr collected 64 items golang/golang_str_test.py::test_strings_basic Traceback (most recent call last): File "golang/_golang_str.pyx", line 2270, in golang._golang._xuniord return ord(u) ValueError: only single character unicode strings can be converted to Py_UCS4, got length 2 Exception ValueError: 'only single character unicode strings can be converted to Py_UCS4, got length 2' in 'golang._golang._utf8_decode_rune' ignored (py2d.venv2023) kirr@deca:~/src/tools/go/pygolang-xgpystr$ python Python 2.7.18 (tags/2.7-dirty:8d21aa21f2c, Mar 30 2023, 07:38:40) [GCC 10.2.1 20210110] on linux2 Type "help", "copyright", "credits" or "license" for more information. 
>>> from pygolang import * Traceback (most recent call last): File "", line 1, in ImportError: No module named pygolang >>> from golang import * >>> ord('xy') Traceback (most recent call last): File "", line 1, in TypeError: ord() expected a character, but string of length 2 found >>> ord(b'xy') Traceback (most recent call last): File "", line 1, in TypeError: ord() expected a character, but string of length 2 found >>> ord(u'xy') Traceback (most recent call last): File "", line 1, in TypeError: ord() expected a character, but string of length 2 found >>> ord(b('xy')) Traceback (most recent call last): File "", line 1, in TypeError: ord() expected a character, but string of length 2 found >>> ord(u('xy')) Traceback (most recent call last): File "golang/_golang_str.pyx", line 2270, in golang._golang._xuniord return ord(u) ValueError: only single character unicode strings can be converted to Py_UCS4, got length 2 Exception ValueError: 'only single character unicode strings can be converted to Py_UCS4, got length 2' in 'golang._golang._utf8_decode_rune' ignored Traceback (most recent call last): File "", line 1, in File "golang/_golang_str.pyx", line 157, in golang._golang.pyu us = _pyu(pyustr, s) File "golang/_golang_str.pyx", line 195, in golang._golang._pyu s = _utf8_decode_surrogateescape(s) File "golang/_golang_str.pyx", line 2198, in golang._golang._utf8_decode_surrogateescape emit(_xunichr(r)) File "golang/_golang_str.pyx", line 2286, in golang._golang._xunichr return unichr(0xd800 + (uh >> 10)) + \ ValueError: unichr() arg not in range(0x10000) (narrow Python build) It was broken in 50b8cb7e (strconv: Move functionality related to UTF8 encode/decode into _golang_str) --- golang/_golang_str.pyx | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/golang/_golang_str.pyx b/golang/_golang_str.pyx index 989f7a3..c4cdef5 100644 --- a/golang/_golang_str.pyx +++ b/golang/_golang_str.pyx @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright (C) 
2018-2022 Nexedi SA and Contributors. +# Copyright (C) 2018-2023 Nexedi SA and Contributors. # Kirill Smelkov # # This program is free software: you can Use, Study, Modify and Redistribute @@ -1791,8 +1791,9 @@ cdef (int, int) _utf8_decode_rune(const uint8_t[::1] s): if _ucs2_build and len(r) == 2: try: return _xuniord(r), l - # e.g. TypeError: ord() expected a character, but string of length 2 found - except TypeError: + # py: TypeError: ord() expected a character, but string of length 2 found + # cy: ValueError: only single character unicode strings can be converted to Py_UCS4, got length 2 + except (TypeError, ValueError): l -= 1 continue From 8dc44e124fc24f873d8e0812a161ec498e6bf5f9 Mon Sep 17 00:00:00 2001 From: Kirill Smelkov Date: Sun, 30 Apr 2023 22:18:09 +0300 Subject: [PATCH 02/29] fixup! golang_str: bstr/ustr pickle support In ebd18f3f the code was ok but there is a thinko in test: it needs to test all pickle protocols from 0 to _including_ HIGHEST_PROTOCOL. --- golang/golang_str_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/golang/golang_str_test.py b/golang/golang_str_test.py index 384e63e..da6b255 100644 --- a/golang/golang_str_test.py +++ b/golang/golang_str_test.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright (C) 2018-2022 Nexedi SA and Contributors. +# Copyright (C) 2018-2023 Nexedi SA and Contributors. 
# Kirill Smelkov # # This program is free software: you can Use, Study, Modify and Redistribute @@ -313,7 +313,7 @@ def test_strings_pickle(): us = u("май") #from pickletools import dis - for proto in range(0, pickle.HIGHEST_PROTOCOL): + for proto in range(0, pickle.HIGHEST_PROTOCOL+1): p_bs = pickle.dumps(bs, proto) #dis(p_bs) bs_ = pickle.loads(p_bs) From 9e2dab5002c919fdf7924defe03b3457af70da52 Mon Sep 17 00:00:00 2001 From: Kirill Smelkov Date: Sun, 30 Apr 2023 22:25:06 +0300 Subject: [PATCH 03/29] golang_str: tests: Adjust test_strings_index2 not to depend on repr(ustr|bstr) repr(ustr|bstr) will change behaviour depending on whether we are running under regular python, or gpython with string types replaced by bstr/ustr. But this test is completely orthogonal to that. -> Let's untie it from particular repr behaviour by emitting verified items in quoted form + asserting their types in the code. --- golang/testprog/golang_test_str_index2.py | 13 +++++++----- golang/testprog/golang_test_str_index2.txt | 24 +++++++++++----------- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/golang/testprog/golang_test_str_index2.py b/golang/testprog/golang_test_str_index2.py index f9790ec..4fb4790 100755 --- a/golang/testprog/golang_test_str_index2.py +++ b/golang/testprog/golang_test_str_index2.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright (C) 2022 Nexedi SA and Contributors. -# Kirill Smelkov +# Copyright (C) 2022-2023 Nexedi SA and Contributors. 
+# Kirill Smelkov # # This program is free software: you can Use, Study, Modify and Redistribute # it under the terms of the GNU General Public License version 3, or (at your @@ -29,7 +29,8 @@ from __future__ import print_function, absolute_import -from golang import b, u +from golang import b, u, bstr, ustr +from golang.gcompat import qq def main(): @@ -37,8 +38,10 @@ def main(): bs = b("миру мир") def emit(what, uobj, bobj): - print("u"+what, repr(uobj)) - print("b"+what, repr(bobj)) + assert type(uobj) is ustr + assert type(bobj) is bstr + print("u"+what, qq(uobj)) + print("b"+what, qq(bobj)) emit("s", us, bs) emit("s[:]", us[:], bs[:]) diff --git a/golang/testprog/golang_test_str_index2.txt b/golang/testprog/golang_test_str_index2.txt index 5977e19..c18811a 100644 --- a/golang/testprog/golang_test_str_index2.txt +++ b/golang/testprog/golang_test_str_index2.txt @@ -1,12 +1,12 @@ -us u('миру мир') -bs b('миру мир') -us[:] u('миру мир') -bs[:] b('миру мир') -us[0:1] u('м') -bs[0:1] b(b'\xd0') -us[0:2] u('ми') -bs[0:2] b('м') -us[1:2] u('и') -bs[1:2] b(b'\xbc') -us[0:-1] u('миру ми') -bs[0:-1] b(b'миру ми\xd1') +us "миру мир" +bs "миру мир" +us[:] "миру мир" +bs[:] "миру мир" +us[0:1] "м" +bs[0:1] "\xd0" +us[0:2] "ми" +bs[0:2] "м" +us[1:2] "и" +bs[1:2] "\xbc" +us[0:-1] "миру ми" +bs[0:-1] "миру ми\xd1" From bf16f6851b2a9ffeb01b8dc3027dfadec00550c1 Mon Sep 17 00:00:00 2001 From: Kirill Smelkov Date: Mon, 1 May 2023 10:04:31 +0300 Subject: [PATCH 04/29] golang_str: Fix bstr.decode to handle 'string-escape' codec properly On py2 str.decode('string-escape') returns str, not unicode and this property is actually being used and relied upon by Lib/pickle.py: https://github.com/python/cpython/blob/v2.7.18-0-g8d21aa21f2c/Lib/pickle.py#L967-L977 We promised bstr to be drop-in replacement for str on py2, so let's adjust its behaviour to match the original because if we do not, unpickling strings will break when str is replaced by bstr under gpython. 
Do not add bstr.encode yet until we hit a real case where it is actually used. --- golang/_golang_str.pyx | 3 +++ golang/golang_str_test.py | 11 +++++++++++ 2 files changed, 14 insertions(+) diff --git a/golang/_golang_str.pyx b/golang/_golang_str.pyx index c4cdef5..3294c7c 100644 --- a/golang/_golang_str.pyx +++ b/golang/_golang_str.pyx @@ -422,6 +422,9 @@ class pybstr(bytes): x = _utf8_decode_surrogateescape(self) else: x = bytes.decode(self, encoding, errors) + # on py2 e.g. bytes.decode('string-escape') returns bytes + if PY_MAJOR_VERSION < 3 and isinstance(x, bytes): + return pyb(x) return pyu(x) if PY_MAJOR_VERSION < 3: diff --git a/golang/golang_str_test.py b/golang/golang_str_test.py index da6b255..e200546 100644 --- a/golang/golang_str_test.py +++ b/golang/golang_str_test.py @@ -690,6 +690,17 @@ def test_strings_encodedecode(): with raises(UnicodeEncodeError): u_k8mir.encode('ascii') + # on py2 there are encodings for which bytes.decode returns bytes + # e.g. bytes.decode('string-escape') is actually used by pickle + # verify that this exact semantic is preserved + if six.PY3: + with raises(LookupError): bs.decode('hex') + with raises(LookupError): bs.decode('string-escape') + else: + _ = bs.decode('string-escape'); assert type(_) is bstr; assert _ == bs + _ = b(r'x\'y').decode('string-escape'); assert type(_) is bstr; assert _bdata(_) == b"x'y" + _ = b('616263').decode('hex'); assert type(_) is bstr; assert _bdata(_) == b"abc" + # verify string operations like `x * 3` for all cases from bytes, bytearray, unicode, bstr and ustr. 
@mark.parametrize('tx', (bytes, unicode, bytearray, bstr, ustr)) From 302b51c5292fcece7db194d46b3e7a32bece4d1f Mon Sep 17 00:00:00 2001 From: Kirill Smelkov Date: Mon, 1 May 2023 17:10:24 +0300 Subject: [PATCH 05/29] golang_str: tests: Make test_strings_methods more robust with upcoming unicode=ustr Previously test_strings_methods was testing a method via comparing bstr and ustr results of .method() with similar result of unicode.method(). This works reasonably ok. However under gpython, when unicode will be replaced with ustr, it will no longer compare results of bstr/ustr methods with something good and external - indeed in that case bstr/ustr .method() will be compared to result of ustr.method() which opens the door for bugs to stay unnoticed. -> Adjust the test to explicitly provide expected result for all entries in the test vector. We make sure those results are good and match std python because we also assert that unicode.method() matches it. --- golang/golang_str_test.py | 176 +++++++++++++++++++------------------- 1 file changed, 89 insertions(+), 87 deletions(-) diff --git a/golang/golang_str_test.py b/golang/golang_str_test.py index e200546..1c0c574 100644 --- a/golang/golang_str_test.py +++ b/golang/golang_str_test.py @@ -1530,7 +1530,10 @@ def test_strings_methods(): # argv and kw being various combinations of unicode,bstr,ustr, bytes/bytearray. def checkop(s, meth, *argv, **kw): assert type(s) is str - ok = kw.pop('ok', None) + ok = kw.pop('ok') + if six.PY2: + ok = deepReplaceStr(ok, xunicode) + optional = kw.pop('optional', False) bs = b(s) us = u(s) # verify {str,bstr,ustr}.meth with str arguments @@ -1545,13 +1548,11 @@ def checkop(s, meth, *argv, **kw): r = xcall(s, meth, *argv_unicode, **kw_unicode) # we provide fallback implementations on e.g. 
py2 - if ok is not None: - if six.PY2: - ok = xunicode(ok) - if isinstance(r, NotImplementedError): + if isinstance(r, NotImplementedError): + if not optional: r = ok - else: - assert r == ok + else: + assert r == ok assert type(s) is unicode br = xcall(bs, meth, *argv, **kw) @@ -1662,90 +1663,91 @@ def _(*argv, **kw): _ = Verifier - _("миру мир").__contains__("ру") - _("миру мир").__contains__("α") - _("мир").capitalize() - _("МиР").casefold() - _("мир").center(10) - _("мир").center(10, "ж") + _("миру мир").__contains__("ру", ok=True) + _("миру мир").__contains__("α", ok=False) + _("мир").capitalize( ok="Мир") + _("МиР").casefold( ok="мир", optional=True) # py3.3 + _("мир").center(10, ok=" мир ") + _("мир").center(10, "ж", ok="жжжмиржжжж") # count, endswith - tested in test_strings_index - _("миру\tмир").expandtabs() - _("миру\tмир").expandtabs(4) + _("миру\tмир").expandtabs( ok="миру мир") + _("миру\tмир").expandtabs(2, ok="миру мир") # find, index - tested in test_strings_index - _("мир").isalnum() - _("мир!").isalnum() - _("мир").isalpha() - _("мир!").isalpha() - _("мир").isascii() - _("hello").isascii() - _("hellЫ").isascii() - _("123 мир").isdecimal() - _("123 q").isdecimal() - _("123").isdecimal() - _("мир").isdigit() - _("123 мир").isdigit() - _("123 q").isdigit() - _("123").isdigit() - _("٤").isdigit() # arabic 4 - _("мир").isidentifier() - _("мир$").isidentifier() - _("мир").islower() - _("Мир").islower() - _("мир").isnumeric() - _("123").isnumeric() - _("0x123").isnumeric() - _("мир").isprintable() - _("\u2009").isspace() # thin space - _(" ").isspace() - _("мир").isspace() - _("мир").istitle() - _("Мир").istitle() - _(" мир ").join(["да", "май", "труд"]) - _("мир").ljust(10) - _("мир").ljust(10, 'ж') - _("МиР").lower() - _("\u2009 мир").lstrip() - _("\u2009 мир\u2009 ").lstrip() - _("мммир").lstrip('ми') - _("миру мир").partition('ру') - _("миру мир").partition('ж') - _("миру мир").removeprefix("мир") - _("миру мир").removesuffix("мир") - _("миру 
мир").replace("ир", "ж") - _("миру мир").replace("ир", "ж", 1) + _("мир").isalnum( ok=True) + _("мир!").isalnum( ok=False) + _("мир").isalpha( ok=True) + _("мир!").isalpha( ok=False) + _("мир").isascii( ok=False, optional=True) # py3.7 + _("hello").isascii( ok=True, optional=True) # py3.7 + _("hellЫ").isascii( ok=False, optional=True) # py3.7 + _("123 мир").isdecimal( ok=False) + _("123 q").isdecimal( ok=False) + _("123").isdecimal( ok=True) + _("мир").isdigit( ok=False) + _("123 мир").isdigit( ok=False) + _("123 q").isdigit( ok=False) + _("123").isdigit( ok=True) + _("٤").isdigit( ok=True) # arabic 4 + _("мир").isidentifier( ok=True, optional=True) # py3.0 + _("мир$").isidentifier( ok=False, optional=True) # py3.0 + _("мир").islower( ok=True) + _("Мир").islower( ok=False) + _("мир").isnumeric( ok=False) + _("123").isnumeric( ok=True) + _("0x123").isnumeric( ok=False) + _("мир").isprintable( ok=True, optional=True) # py3.0 + _("\u2009").isspace( ok=x32(True,False)) # thin space + _(" ").isspace( ok=True) + _("мир").isspace( ok=False) + _("мир").istitle( ok=False) + _("Мир").istitle( ok=True) + _("МИр").istitle( ok=False) + _(" мир ").join(["да", "май", "труд"], ok="да мир май мир труд") + _("мир").ljust(10, ok="мир ") + _("мир").ljust(10, 'ж', ok="миржжжжжжж") + _("МиР").lower( ok="мир") + _("\u2009 мир").lstrip( ok=x32("мир", "\u2009 мир")) + _("\u2009 мир\u2009 ").lstrip( ok=x32("мир\u2009 ", "\u2009 мир\u2009 ")) + _("мммир").lstrip('ми', ok="р") + _("миру мир").partition('ру', ok=("ми", "ру", " мир")) + _("миру мир").partition('ж', ok=("миру мир", "", "")) + _("миру мир").removeprefix("мир", ok="у мир", optional=True) # py3.9 + _("миру мир").removesuffix("мир", ok="миру ", optional=True) # py3.9 + _("миру мир").replace("ир", "ж", ok="мжу мж") + _("миру мир").replace("ир", "ж", 1, ok="мжу мир") # rfind, rindex - tested in test_strings_index - _("мир").rjust(10) - _("мир").rjust(10, 'ж') - _("миру мир").rpartition('ру') - _("миру мир").rpartition('ж') - 
_("мир").rsplit() - _("привет мир").rsplit() - _("привет\u2009мир").rsplit() - _("привет мир").rsplit("и") - _("привет мир").rsplit("и", 1) - _("мир \u2009").rstrip() - _(" мир \u2009").rstrip() - _("мируу").rstrip('ру') - _("мир").split() - _("привет мир").split() - _("привет\u2009мир").split() - _("привет мир").split("и") - _("привет мир").split("и", 1) - _("мир").splitlines() - _("миру\nмир").splitlines() - _("миру\nмир").splitlines(True) - _("миру\nмир\n").splitlines(True) - _("мир\nтруд\nмай\n").splitlines() - _("мир\nтруд\nмай\n").splitlines(True) + _("мир").rjust(10, ok=" мир") + _("мир").rjust(10, 'ж', ok="жжжжжжжмир") + _("миру мир").rpartition('ру', ok=("ми", "ру", " мир")) + _("миру мир").rpartition('ж', ok=("", "", "миру мир")) + _("мир").rsplit( ok=["мир"]) + _("привет мир").rsplit( ok=["привет", "мир"]) + _("привет\u2009мир").rsplit( ok=x32(["привет", "мир"], ["привет\u2009мир"])) + _("привет мир").rsplit("и", ok=["пр", "вет м", "р"]) + _("привет мир").rsplit("и", 1, ok=["привет м", "р"]) + _("мир \u2009").rstrip( ok=x32("мир", "мир \u2009")) + _(" мир \u2009").rstrip( ok=x32(" мир", " мир \u2009")) + _("мируу").rstrip('ру', ok="ми") + _("мир").split( ok=["мир"]) + _("привет мир").split( ok=["привет", "мир"]) + _("привет\u2009мир").split( ok=x32(['привет', 'мир'], ["привет\u2009мир"])) + _("привет мир").split("и", ok=["пр", "вет м", "р"]) + _("привет мир").split("и", 1, ok=["пр", "вет мир"]) + _("мир").splitlines( ok=["мир"]) + _("миру\nмир").splitlines( ok=["миру", "мир"]) + _("миру\nмир").splitlines(True, ok=["миру\n", "мир"]) + _("миру\nмир\n").splitlines(True, ok=["миру\n", "мир\n"]) + _("мир\nтруд\nмай\n").splitlines( ok=["мир", "труд", "май"]) + _("мир\nтруд\nмай\n").splitlines(True, ok=["мир\n", "труд\n", "май\n"]) # startswith - tested in test_strings_index - _("\u2009 мир \u2009").strip() - _("миру мир").strip('мир') - _("МиР").swapcase() - _("МиР").title() - _("мир").translate({ord(u'м'):ord(u'и'), ord(u'и'):'я', ord(u'р'):None}) - 
_("МиР").upper() - _("мир").zfill(10) - _("123").zfill(10) + _("\u2009 мир \u2009").strip( ok=x32("мир", "\u2009 мир \u2009")) + _("миру мир").strip('мир', ok="у ") + _("МиР").swapcase( ok="мИр") + _("МиР").title( ok="Мир") + _("мир").translate({ord(u'м'):ord(u'и'), ord(u'и'):'я', ord(u'р'):None}, ok="ия") + _("МиР").upper( ok="МИР") + _("мир").zfill(10, ok="0000000мир") + _("123").zfill(10, ok="0000000123") # verify bstr.translate in bytes mode From e75d21fdf22459e40bb2fc99cd4a4920e47dc9e8 Mon Sep 17 00:00:00 2001 From: Kirill Smelkov Date: Mon, 1 May 2023 17:38:18 +0300 Subject: [PATCH 06/29] gpython: tests: Factorize test_Xruntime Factor-out subroutine to run tfunc in subprocess interpreter spawned with `-X xopt=xval`. This helps clarity and later in addition to `-X gpython.runtime` we will also need it to verify `-X gpython.strings`. --- gpython/gpython_test.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/gpython/gpython_test.py b/gpython/gpython_test.py index f38b56a..e7d1336 100644 --- a/gpython/gpython_test.py +++ b/gpython/gpython_test.py @@ -359,20 +359,26 @@ def test_pymain_run_via_relpath(): out2 = pyout(['./__init__.py'] + argv, pyexe=sys._gpy_underlying_executable, cwd=here) assert out1 == out2 + # verify -X gpython.runtime=... @gpython_only def test_Xruntime(runtime): + _xopt_assert_in_subprocess('gpython.runtime', runtime, + assert_gevent_activated if runtime != 'threads' else \ + assert_gevent_not_activated) + +# _xopt_assert_in_subprocess runs tfunc in subprocess interpreter spawned with +# `-X xopt=xval` and checks that there is no error. 
+def _xopt_assert_in_subprocess(xopt, xval, tfunc): + XOPT = xopt.upper().replace('.','_') # gpython.runtime -> GPYTHON_RUNTIME env = os.environ.copy() - env.pop('GPYTHON_RUNTIME', None) # del + env.pop(XOPT, None) # del argv = [] - if runtime != '': - argv += ['-X', 'gpython.runtime='+runtime] + if xval != '': + argv += ['-X', xopt+'='+xval] prog = 'from gpython import gpython_test as t; ' - if runtime != 'threads': - prog += 't.assert_gevent_activated(); ' - else: - prog += 't.assert_gevent_not_activated(); ' + prog += 't.%s(); ' % tfunc.__name__ prog += 'print("ok")' argv += ['-c', prog] From 5716b1303c6caf803d4042116e1748fe10b06208 Mon Sep 17 00:00:00 2001 From: Kirill Smelkov Date: Mon, 1 May 2023 17:48:40 +0300 Subject: [PATCH 07/29] gpython: Fix `gpython -X gpython.runtime=threads` to spawn subinterpreters with threads runtime by default Previously it was not the case and gpython with default being gevent runtime was spawned even if parent gpython was instructed to use threads runtime: (z-dev) kirr@deca:~/src/tools/go/pygolang$ gpython -X gpython.runtime=threads Python 2.7.18 (default, Apr 28 2021, 17:39:59) [GCC 10.2.1 20210110] [GPython 0.1] [threads] on linux2 Type "help", "copyright", "credits" or "license" for more information. (InteractiveConsole) >>> import sys >>> sys.version '2.7.18 (default, Apr 28 2021, 17:39:59) \n[GCC 10.2.1 20210110] [GPython 0.1] [threads]' <-- NOTE threads >>> import subprocess subprocess.call(sys.executable)ble) Python 2.7.18 (default, Apr 28 2021, 17:39:59) [GCC 10.2.1 20210110] [GPython 0.1] [gevent 21.1.2] on linux2 <-- NOTE gevent Type "help", "copyright", "credits" or "license" for more information. 
(InteractiveConsole) >>> --- gpython/__init__.py | 5 +++++ gpython/gpython_test.py | 7 ++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/gpython/__init__.py b/gpython/__init__.py index 4a3a7ff..45b0a16 100755 --- a/gpython/__init__.py +++ b/gpython/__init__.py @@ -408,6 +408,11 @@ def main(): argv = [sys.argv[0]] + argv_ + igetopt.argv + # propagate those settings as defaults to subinterpreters, so that e.g. + # sys.executable spawned from under `gpython -X gpython.runtime=threads` + # also uses "threads" runtime by default. + os.environ['GPYTHON_RUNTIME'] = gpy_runtime + # init initializes according to selected runtime # it is called after options are parsed and sys.path is setup correspondingly. # this way golang and gevent are imported from exactly the same place as diff --git a/gpython/gpython_test.py b/gpython/gpython_test.py index e7d1336..420d0d4 100644 --- a/gpython/gpython_test.py +++ b/gpython/gpython_test.py @@ -369,6 +369,9 @@ def test_Xruntime(runtime): # _xopt_assert_in_subprocess runs tfunc in subprocess interpreter spawned with # `-X xopt=xval` and checks that there is no error. +# +# It is also verified that tfunc runs ok in sub-subprocess interpreter spawned +# _without_ `-X ...`, i.e. once given -X setting is inherited by spawned interpreters. 
def _xopt_assert_in_subprocess(xopt, xval, tfunc): XOPT = xopt.upper().replace('.','_') # gpython.runtime -> GPYTHON_RUNTIME env = os.environ.copy() @@ -377,8 +380,10 @@ def _xopt_assert_in_subprocess(xopt, xval, tfunc): argv = [] if xval != '': argv += ['-X', xopt+'='+xval] - prog = 'from gpython import gpython_test as t; ' + prog = import_t = 'from gpython import gpython_test as t; ' prog += 't.%s(); ' % tfunc.__name__ + prog += import_t # + same in subprocess + prog += "t.pyrun(['-c', '%s t.%s(); ']); " % (import_t, tfunc.__name__) prog += 'print("ok")' argv += ['-c', prog] From 21fab97588e19f6c596f94978e7cb266f5b02608 Mon Sep 17 00:00:00 2001 From: Kirill Smelkov Date: Mon, 1 May 2023 17:59:34 +0300 Subject: [PATCH 08/29] gpython: Fix thinko when rejecting unknown -X option Before: (z-dev) kirr@deca:~/src/tools/go/pygolang$ gpython -X gpython.zzz Traceback (most recent call last): File "/home/kirr/src/wendelin/venv/z-dev/bin/gpython", line 3, in from gpython import main; main() File "/home/kirr/src/tools/go/pygolang/gpython/__init__.py", line 397, in main raise RuntimeError('gpython: unknown -X option %s' % opt) RuntimeError: gpython: unknown -X option -X <-- NOTE After: (z-dev) kirr@deca:~/src/tools/go/pygolang$ gpython -X gpython.zzz Traceback (most recent call last): File "/home/kirr/src/wendelin/venv/z-dev/bin/gpython", line 3, in from gpython import main; main() File "/home/kirr/src/tools/go/pygolang/gpython/__init__.py", line 397, in main raise RuntimeError('gpython: unknown -X option %s' % arg) RuntimeError: gpython: unknown -X option gpython.zzz <-- NOTE --- gpython/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpython/__init__.py b/gpython/__init__.py index 45b0a16..c6be786 100755 --- a/gpython/__init__.py +++ b/gpython/__init__.py @@ -394,7 +394,7 @@ def main(): sys._xoptions['gpython.runtime'] = gpy_runtime else: - raise RuntimeError('gpython: unknown -X option %s' % opt) + raise RuntimeError('gpython: unknown -X 
option %s' % arg) continue From 4546aaecf80f348b811da94772edf34ba75f922c Mon Sep 17 00:00:00 2001 From: Kirill Smelkov Date: Sun, 26 Mar 2023 17:06:30 +0300 Subject: [PATCH 09/29] golang_str: Switch bstr/ustr to cdef classes For gpython to switch builtin str/unicode to bstr/ustr we will need bstr/ustr to have exactly the same C layout as builtin string types. This is possible to achieve only via `cdef class`. It is also good to switch to `cdef class` for RAM savings - from https://github.com/cython/cython/pull/5212#issuecomment-1387659026 : # what Cython does at runtime for `class MyBytes(bytes)` In [3]: MyBytes = type('MyBytes', (bytes,), {'__slots__': ()}) In [4]: MyBytes Out[4]: __main__.MyBytes In [5]: a = bytes(b'123') In [6]: b = MyBytes(b'123') In [7]: a Out[7]: b'123' In [8]: b Out[8]: b'123' In [9]: a == b Out[9]: True In [10]: import sys In [11]: sys.getsizeof(a) Out[11]: 36 In [12]: sys.getsizeof(b) Out[12]: 52 So with `cdef class` we gain more control and optimize memory usage. This was not done before because cython forbids to `cdef class X(bytes)` due to https://github.com/cython/cython/issues/711. We work it around in setup.py with draft for proper patch pre-posted to upstream in https://github.com/cython/cython/pull/5212 . 
--- golang/_golang_str.pyx | 174 +++++++++++++++++++++++++++++--------- golang/golang_str_test.py | 21 ++++- setup.py | 19 +++++ 3 files changed, 173 insertions(+), 41 deletions(-) diff --git a/golang/_golang_str.pyx b/golang/_golang_str.pyx index 3294c7c..59cfe6d 100644 --- a/golang/_golang_str.pyx +++ b/golang/_golang_str.pyx @@ -29,6 +29,15 @@ from cpython cimport Py_EQ, Py_NE, Py_LT, Py_GT, Py_LE, Py_GE from cpython.iterobject cimport PySeqIter_New from cpython cimport PyThreadState_GetDict, PyDict_SetItem from cpython cimport PyObject_CheckBuffer + +cdef extern from "Python.h": + ctypedef struct PyBytesObject: + pass + +cdef extern from "Python.h": + ctypedef struct PyUnicodeObject: + pass + cdef extern from "Python.h": """ #if PY_MAJOR_VERSION < 3 @@ -45,6 +54,7 @@ cdef extern from "Python.h": cdef extern from "Python.h": ctypedef int (*initproc)(object, PyObject *, PyObject *) except -1 ctypedef struct _XPyTypeObject "PyTypeObject": + PyObject* tp_new(PyTypeObject*, PyObject*, PyObject*) except NULL initproc tp_init PySequenceMethods *tp_as_sequence @@ -54,6 +64,8 @@ cdef extern from "Python.h": object (*sq_slice) (object, Py_ssize_t, Py_ssize_t) # present only on py2 +from cython cimport no_gc + from libc.stdint cimport uint8_t from libc.stdio cimport FILE @@ -128,7 +140,12 @@ cdef _pyb(bcls, s): # -> ~bstr | None return None assert type(s) is bytes - return bytes.__new__(bcls, s) + # like bytes.__new__(bcls, s) but call bytes.tp_new directly + # else tp_new_wrapper complains because pybstr.tp_new != bytes.tp_new + argv = (s,) + obj = (<_XPyTypeObject*>bytes).tp_new(bcls, argv, NULL) + Py_DECREF(obj) + return obj cdef _pyu(ucls, s): # -> ~ustr | None if type(s) is ucls: @@ -147,7 +164,12 @@ cdef _pyu(ucls, s): # -> ~ustr | None return None assert type(s) is unicode - return unicode.__new__(ucls, s) + # like unicode .__new__(bcls, s) but call unicode.tp_new directly + # else tp_new_wrapper complains because pyustr.tp_new != unicode.tp_new + argv = (s,) + obj 
= (<_XPyTypeObject*>unicode).tp_new(ucls, argv, NULL) + Py_DECREF(obj) + return obj # _ifbuffer_data returns contained data if obj provides buffer interface. cdef _ifbuffer_data(obj): # -> bytes|None @@ -220,8 +242,8 @@ def pyuchr(int i): # -> 1-character ustr return pyu(unichr(i)) -# XXX cannot `cdef class`: github.com/cython/cython/issues/711 -class pybstr(bytes): +@no_gc # note setup.py assist this to compile despite +cdef class pybstr(bytes): # https://github.com/cython/cython/issues/711 """bstr is byte-string. It is based on bytes and can automatically convert to/from unicode. @@ -253,11 +275,10 @@ class pybstr(bytes): See also: b, ustr/u. """ - # don't allow to set arbitrary attributes. - # won't be needed after switch to -> `cdef class` - __slots__ = () - - def __new__(cls, object='', encoding=None, errors=None): + # XXX due to "cannot `cdef class` with __new__" (https://github.com/cython/cython/issues/799) + # pybstr.__new__ is hand-made in _pybstr_tp_new which invokes ↓ .____new__() . 
+ @staticmethod + def ____new__(cls, object='', encoding=None, errors=None): # encoding or errors -> object must expose buffer interface if not (encoding is None and errors is None): object = _buffer_decode(object, encoding, errors) @@ -360,8 +381,10 @@ class pybstr(bytes): def __add__(a, b): # NOTE Cython < 3 does not automatically support __radd__ for cdef class # https://cython.readthedocs.io/en/latest/src/userguide/migrating_to_cy30.html#arithmetic-special-methods - # but pybstr is currently _not_ cdef'ed class # see also https://github.com/cython/cython/issues/4750 + if type(a) is not pybstr: + assert type(b) is pybstr + return b.__radd__(a) return pyb(bytes.__add__(a, _pyb_coerce(b))) def __radd__(b, a): @@ -377,6 +400,9 @@ class pybstr(bytes): # __mul__, __rmul__ (no need to override __imul__) def __mul__(a, b): + if type(a) is not pybstr: + assert type(b) is pybstr + return b.__rmul__(a) return pyb(bytes.__mul__(a, b)) def __rmul__(b, a): return b.__mul__(a) @@ -436,8 +462,7 @@ class pybstr(bytes): # all other string methods def capitalize(self): return pyb(pyu(self).capitalize()) - if _strhas('casefold'): # py3.3 TODO provide py2 implementation - def casefold(self): return pyb(pyu(self).casefold()) + def casefold(self): return pyb(pyu(self).casefold()) def center(self, width, fillchar=' '): return pyb(pyu(self).center(width, fillchar)) def count(self, sub, start=None, end=None): return bytes.count(self, _pyb_coerce(sub), start, end) @@ -463,12 +488,10 @@ class pybstr(bytes): # isascii(self) no need to override def isdecimal(self): return pyu(self).isdecimal() def isdigit(self): return pyu(self).isdigit() - if _strhas('isidentifier'): # py3 TODO provide fallback implementation - def isidentifier(self): return pyu(self).isidentifier() + def isidentifier(self): return pyu(self).isidentifier() def islower(self): return pyu(self).islower() def isnumeric(self): return pyu(self).isnumeric() - if _strhas('isprintable'): # py3 TODO provide fallback implementation - 
def isprintable(self): return pyu(self).isprintable() + def isprintable(self): return pyu(self).isprintable() def isspace(self): return pyu(self).isspace() def istitle(self): return pyu(self).istitle() @@ -477,10 +500,8 @@ class pybstr(bytes): def lower(self): return pyb(pyu(self).lower()) def lstrip(self, chars=None): return pyb(pyu(self).lstrip(chars)) def partition(self, sep): return tuple(pyb(_) for _ in bytes.partition(self, _pyb_coerce(sep))) - if _strhas('removeprefix'): # py3.9 TODO provide fallback implementation - def removeprefix(self, prefix): return pyb(pyu(self).removeprefix(prefix)) - if _strhas('removesuffix'): # py3.9 TODO provide fallback implementation - def removesuffix(self, suffix): return pyb(pyu(self).removesuffix(suffix)) + def removeprefix(self, prefix): return pyb(pyu(self).removeprefix(prefix)) + def removesuffix(self, suffix): return pyb(pyu(self).removesuffix(suffix)) def replace(self, old, new, count=-1): return pyb(bytes.replace(self, _pyb_coerce(old), _pyb_coerce(new), count)) # NOTE rfind/rindex & friends should return byte-position, not unicode-position @@ -528,8 +549,35 @@ class pybstr(bytes): return pyustr.maketrans(x, y, z) -# XXX cannot `cdef class` with __new__: https://github.com/cython/cython/issues/799 -class pyustr(unicode): +# hand-made pybstr.__new__ (workaround for https://github.com/cython/cython/issues/799) +cdef PyObject* _pybstr_tp_new(PyTypeObject* _cls, PyObject* _argv, PyObject* _kw) except NULL: + argv = () + if _argv != NULL: + argv = _argv + kw = {} + if _kw != NULL: + kw = _kw + + cdef object x = pybstr.____new__(_cls, *argv, **kw) + Py_INCREF(x) + return x +(<_XPyTypeObject*>pybstr).tp_new = &_pybstr_tp_new + +# bytes uses "optimized" and custom .tp_basicsize and .tp_itemsize: +# https://github.com/python/cpython/blob/v2.7.18-0-g8d21aa21f2c/Objects/stringobject.c#L26-L32 +# https://github.com/python/cpython/blob/v2.7.18-0-g8d21aa21f2c/Objects/stringobject.c#L3816-L3820 +(pybstr) .tp_basicsize = 
(bytes).tp_basicsize +(pybstr) .tp_itemsize = (bytes).tp_itemsize + +# make sure pybstr C layout corresponds to bytes C layout exactly +# we patched cython to allow from-bytes cdef class inheritance and we also set +# .tp_basicsize directly above. All this works ok only if C layouts for pybstr +# and bytes are completely the same. +assert sizeof(pybstr) == sizeof(PyBytesObject) + + +@no_gc +cdef class pyustr(unicode): """ustr is unicode-string. It is based on unicode and can automatically convert to/from bytes. @@ -556,11 +604,10 @@ class pyustr(unicode): See also: u, bstr/b. """ - # don't allow to set arbitrary attributes. - # won't be needed after switch to -> `cdef class` - __slots__ = () - - def __new__(cls, object='', encoding=None, errors=None): + # XXX due to "cannot `cdef class` with __new__" (https://github.com/cython/cython/issues/799) + # pyustr.__new__ is hand-made in _pyustr_tp_new which invokes ↓ .____new__() . + @staticmethod + def ____new__(cls, object='', encoding=None, errors=None): # encoding or errors -> object must expose buffer interface if not (encoding is None and errors is None): object = _buffer_decode(object, encoding, errors) @@ -652,8 +699,10 @@ class pyustr(unicode): def __add__(a, b): # NOTE Cython < 3 does not automatically support __radd__ for cdef class # https://cython.readthedocs.io/en/latest/src/userguide/migrating_to_cy30.html#arithmetic-special-methods - # but pyustr is currently _not_ cdef'ed class # see also https://github.com/cython/cython/issues/4750 + if type(a) is not pyustr: + assert type(b) is pyustr + return b.__radd__(a) return pyu(unicode.__add__(a, _pyu_coerce(b))) def __radd__(b, a): @@ -671,6 +720,9 @@ class pyustr(unicode): # __mul__, __rmul__ (no need to override __imul__) def __mul__(a, b): + if type(a) is not pyustr: + assert type(b) is pyustr + return b.__rmul__(a) return pyu(unicode.__mul__(a, b)) def __rmul__(b, a): return b.__mul__(a) @@ -723,8 +775,7 @@ class pyustr(unicode): # all other string methods 
def capitalize(self): return pyu(unicode.capitalize(self)) - if _strhas('casefold'): # py3.3 TODO provide fallback implementation - def casefold(self): return pyu(unicode.casefold(self)) + def casefold(self): return pyu(unicode.casefold(self)) def center(self, width, fillchar=' '): return pyu(unicode.center(self, width, _pyu_coerce(fillchar))) def count(self, sub, start=None, end=None): # cython optimizes unicode.count to directly call PyUnicode_Count - @@ -768,10 +819,8 @@ class pyustr(unicode): def lower(self): return pyu(unicode.lower(self)) def lstrip(self, chars=None): return pyu(unicode.lstrip(self, _xpyu_coerce(chars))) def partition(self, sep): return tuple(pyu(_) for _ in unicode.partition(self, _pyu_coerce(sep))) - if _strhas('removeprefix'): # py3.9 TODO provide fallback implementation - def removeprefix(self, prefix): return pyu(unicode.removeprefix(self, _pyu_coerce(prefix))) - if _strhas('removesuffix'): # py3.9 TODO provide fallback implementation - def removesuffix(self, suffix): return pyu(unicode.removesuffix(self, _pyu_coerce(suffix))) + def removeprefix(self, prefix): return pyu(unicode.removeprefix(self, _pyu_coerce(prefix))) + def removesuffix(self, suffix): return pyu(unicode.removesuffix(self, _pyu_coerce(suffix))) def replace(self, old, new, count=-1): return pyu(unicode.replace(self, _pyu_coerce(old), _pyu_coerce(new), count)) def rfind(self, sub, start=None, end=None): if start is None: start = 0 @@ -864,6 +913,24 @@ class pyustr(unicode): return t +# hand-made pyustr.__new__ (workaround for https://github.com/cython/cython/issues/799) +cdef PyObject* _pyustr_tp_new(PyTypeObject* _cls, PyObject* _argv, PyObject* _kw) except NULL: + argv = () + if _argv != NULL: + argv = _argv + kw = {} + if _kw != NULL: + kw = _kw + + cdef object x = pyustr.____new__(_cls, *argv, **kw) + Py_INCREF(x) + return x +(<_XPyTypeObject*>pyustr).tp_new = &_pyustr_tp_new + +# similarly to bytes - want same C layout for pyustr vs unicode +assert sizeof(pyustr) == 
sizeof(PyUnicodeObject) + + # _pyustrIter wraps unicode iterator to return pyustr for each yielded character. cdef class _pyustrIter: cdef object uiter @@ -941,6 +1008,31 @@ if PY2: (<_XPyTypeObject*>pyustr) .tp_as_sequence.sq_slice = NULL +# ---- adjust bstr/ustr classes after what cython generated ---- + +# remove unsupported bstr/ustr methods. do it outside of `cdef class` to +# workaround https://github.com/cython/cython/issues/4556 (`if ...` during +# `cdef class` is silently handled wrongly) +cdef _bstrustr_remove_unsupported_slots(): + vslot = ( + 'casefold', # py3.3 TODO provide py2 implementation + 'isidentifier', # py3 TODO provide fallback implementation + 'isprintable', # py3 TODO provide fallback implementation + 'removeprefix', # py3.9 TODO provide fallback implementation + 'removesuffix', # py3.9 TODO provide fallback implementation + ) + for slot in vslot: + if not hasattr(unicode, slot): + _patch_slot(pybstr, slot, DEL) + try: + _patch_slot(pyustr, slot, DEL) + except KeyError: # e.g. we do not define ustr.isprintable ourselves + pass +_bstrustr_remove_unsupported_slots() + + +# ---- quoting ---- + # _bpysmartquote_u3b2 quotes bytes/bytearray s the same way python would do for string. # # nonascii_escape indicates whether \xNN with NN >= 0x80 is present in the output. @@ -1321,12 +1413,15 @@ cdef _InBStringify _inbstringify_get(): # # if func_or_descr is descriptor (has __get__), it is installed as is. # otherwise it is wrapped with "unbound method" descriptor. +# +# if func_or_descr is DEL the slot is removed from typ's __dict__. 
+cdef DEL = object() cdef _patch_slot(PyTypeObject* typ, str name, object func_or_descr): typdict = (typ.tp_dict) #print("\npatching %s.%s with %r" % (typ.tp_name, name, func_or_descr)) #print("old: %r" % typdict.get(name)) - if hasattr(func_or_descr, '__get__'): + if hasattr(func_or_descr, '__get__') or func_or_descr is DEL: descr = func_or_descr else: func = func_or_descr @@ -1335,7 +1430,10 @@ cdef _patch_slot(PyTypeObject* typ, str name, object func_or_descr): else: descr = _UnboundMethod(func) - typdict[name] = descr + if descr is DEL: + del typdict[name] + else: + typdict[name] = descr #print("new: %r" % typdict.get(name)) PyType_Modified(typ) @@ -1686,10 +1784,6 @@ class _BFormatter(pystring.Formatter): # ---- misc ---- -# _strhas returns whether unicode string type has specified method. -cdef bint _strhas(str meth) except *: - return hasattr(unicode, meth) - cdef object _xpyu_coerce(obj): return _pyu_coerce(obj) if obj is not None else None diff --git a/golang/golang_str_test.py b/golang/golang_str_test.py index 1c0c574..71c4cff 100644 --- a/golang/golang_str_test.py +++ b/golang/golang_str_test.py @@ -31,7 +31,7 @@ import six from six import text_type as unicode, unichr from six.moves import range as xrange -import re, pickle, copy, types +import gc, re, pickle, copy, types import array, collections @@ -284,6 +284,25 @@ def _(text, breprok, ureprok): bs.hello = 1 +# verify that bstr/ustr are created with correct refcount. 
+def test_strings_refcount(): + # first verify our logic on std type + obj = xbytes(u'abc'); assert type(obj) is bytes + gc.collect(); assert sys.getrefcount(obj) == 1+1 # +1 due to obj passed to getrefcount call + + # bstr + obj = b('abc'); assert type(obj) is bstr + gc.collect(); assert sys.getrefcount(obj) == 1+1 + obj = bstr('abc'); assert type(obj) is bstr + gc.collect(); assert sys.getrefcount(obj) == 1+1 + + # ustr + obj = u('abc'); assert type(obj) is ustr + gc.collect(); assert sys.getrefcount(obj) == 1+1 + obj = ustr('abc'); assert type(obj) is ustr + gc.collect(); assert sys.getrefcount(obj) == 1+1 + + # verify memoryview(bstr|ustr). def test_strings_memoryview(): bs = b('мир') diff --git a/setup.py b/setup.py index 37e9f9d..bd5148b 100644 --- a/setup.py +++ b/setup.py @@ -18,6 +18,25 @@ # See COPYING file for full licensing terms. # See https://www.nexedi.com/licensing for rationale and options. +# patch cython to allow `cdef class X(bytes)` while building pygolang to +# workaround https://github.com/cython/cython/issues/711 +# see `cdef class pybstr` in golang/_golang_str.pyx for details. 
+# (should become unneeded with cython 3 once https://github.com/cython/cython/pull/5212 is finished) +import inspect +from Cython.Compiler.PyrexTypes import BuiltinObjectType +def pygo_cy_builtin_type_name_set(self, v): + self._pygo_name = v +def pygo_cy_builtin_type_name_get(self): + name = self._pygo_name + if name == 'bytes': + caller = inspect.currentframe().f_back.f_code.co_name + if caller == 'analyse_declarations': + # need anything different from 'bytes' to deactivate check in + # https://github.com/cython/cython/blob/c21b39d4/Cython/Compiler/Nodes.py#L4759-L4762 + name = 'xxx' + return name +BuiltinObjectType.name = property(pygo_cy_builtin_type_name_get, pygo_cy_builtin_type_name_set) + from setuptools import find_packages from setuptools.command.install_scripts import install_scripts as _install_scripts from setuptools.command.develop import develop as _develop From 7fd58d3433ae217218afbf4e890a76cdf0dd70d4 Mon Sep 17 00:00:00 2001 From: Kirill Smelkov Date: Sun, 26 Mar 2023 20:25:31 +0300 Subject: [PATCH 10/29] golang_str: Invoke bytes/unicode methods via zbytes/zunicode GPython will patch builtin bytes and unicode types. zbytes and zunicode will refer to original unpatched types. We will use them to invoke original bytes/unicode methods. NOTE we will test against bytes/unicode - not zbytes/zunicode - when inspecting type of objects. In other words we will use original bytes/unicode types only to refer to their original methods and code. 
--- golang/_golang_str.pyx | 180 +++++++++++++++++++++-------------------- 1 file changed, 94 insertions(+), 86 deletions(-) diff --git a/golang/_golang_str.pyx b/golang/_golang_str.pyx index 59cfe6d..fae05e8 100644 --- a/golang/_golang_str.pyx +++ b/golang/_golang_str.pyx @@ -31,10 +31,12 @@ from cpython cimport PyThreadState_GetDict, PyDict_SetItem from cpython cimport PyObject_CheckBuffer cdef extern from "Python.h": + PyTypeObject PyBytes_Type ctypedef struct PyBytesObject: pass cdef extern from "Python.h": + PyTypeObject PyUnicode_Type ctypedef struct PyUnicodeObject: pass @@ -80,6 +82,12 @@ else: import copy_reg as pycopyreg +# zbytes/zunicode point to original std bytes/unicode types even if they will be patched. +# we use them to invoke original bytes/unicode methods. +cdef object zbytes = (&PyBytes_Type) +cdef object zunicode = (&PyUnicode_Type) + + def pyb(s): # -> bstr """b converts object to bstr. @@ -140,10 +148,10 @@ cdef _pyb(bcls, s): # -> ~bstr | None return None assert type(s) is bytes - # like bytes.__new__(bcls, s) but call bytes.tp_new directly - # else tp_new_wrapper complains because pybstr.tp_new != bytes.tp_new + # like zbytes.__new__(bcls, s) but call zbytes.tp_new directly + # else tp_new_wrapper complains because pybstr.tp_new != zbytes.tp_new argv = (s,) - obj = (<_XPyTypeObject*>bytes).tp_new(bcls, argv, NULL) + obj = (<_XPyTypeObject*>zbytes).tp_new(bcls, argv, NULL) Py_DECREF(obj) return obj @@ -164,10 +172,10 @@ cdef _pyu(ucls, s): # -> ~ustr | None return None assert type(s) is unicode - # like unicode .__new__(bcls, s) but call unicode.tp_new directly - # else tp_new_wrapper complains because pyustr.tp_new != unicode.tp_new + # like zunicode .__new__(bcls, s) but call zunicode.tp_new directly + # else tp_new_wrapper complains because pyustr.tp_new != zunicode.tp_new argv = (s,) - obj = (<_XPyTypeObject*>unicode).tp_new(ucls, argv, NULL) + obj = (<_XPyTypeObject*>zunicode).tp_new(ucls, argv, NULL) Py_DECREF(obj) return obj @@ 
-317,7 +325,7 @@ cdef class pybstr(bytes): # https://github.com/cython/cython/issues/711 # retrieve state, which gives bstr, not bytes. Fix state to be bytes ourselves. def __reduce_ex__(self, protocol): if protocol >= 2: - return bytes.__reduce_ex__(self, protocol) + return zbytes.__reduce_ex__(self, protocol) return ( pycopyreg._reconstructor, (self.__class__, self.__class__, _bdata(self)) @@ -332,7 +340,7 @@ cdef class pybstr(bytes): # https://github.com/cython/cython/issues/711 if PY_MAJOR_VERSION >= 3: return hash(pyu(self)) else: - return bytes.__hash__(self) + return zbytes.__hash__(self) # == != < > <= >= # NOTE == and != are special: they must succeed against any type so that @@ -342,18 +350,18 @@ cdef class pybstr(bytes): # https://github.com/cython/cython/issues/711 b = _pyb_coerce(b) except TypeError: return False - return bytes.__eq__(a, b) + return zbytes.__eq__(a, b) def __ne__(a, b): return not a.__eq__(b) - def __lt__(a, b): return bytes.__lt__(a, _pyb_coerce(b)) - def __gt__(a, b): return bytes.__gt__(a, _pyb_coerce(b)) - def __le__(a, b): return bytes.__le__(a, _pyb_coerce(b)) - def __ge__(a, b): return bytes.__ge__(a, _pyb_coerce(b)) + def __lt__(a, b): return zbytes.__lt__(a, _pyb_coerce(b)) + def __gt__(a, b): return zbytes.__gt__(a, _pyb_coerce(b)) + def __le__(a, b): return zbytes.__le__(a, _pyb_coerce(b)) + def __ge__(a, b): return zbytes.__ge__(a, _pyb_coerce(b)) # len - no need to override # [], [:] def __getitem__(self, idx): - x = bytes.__getitem__(self, idx) + x = zbytes.__getitem__(self, idx) if type(idx) is slice: return pyb(x) else: @@ -374,7 +382,7 @@ cdef class pybstr(bytes): # https://github.com/cython/cython/issues/711 def __contains__(self, key): # NOTE on py3 bytes.__contains__ accepts numbers and buffers. 
We don't want to # automatically coerce any of them to bytestrings - return bytes.__contains__(self, _pyb_coerce(key)) + return zbytes.__contains__(self, _pyb_coerce(key)) # __add__, __radd__ (no need to override __iadd__) @@ -385,7 +393,7 @@ cdef class pybstr(bytes): # https://github.com/cython/cython/issues/711 if type(a) is not pybstr: assert type(b) is pybstr return b.__radd__(a) - return pyb(bytes.__add__(a, _pyb_coerce(b))) + return pyb(zbytes.__add__(a, _pyb_coerce(b))) def __radd__(b, a): # a.__add__(b) returned NotImplementedError, e.g. for unicode.__add__(bstr) @@ -403,7 +411,7 @@ cdef class pybstr(bytes): # https://github.com/cython/cython/issues/711 if type(a) is not pybstr: assert type(b) is pybstr return b.__rmul__(a) - return pyb(bytes.__mul__(a, b)) + return pyb(zbytes.__mul__(a, b)) def __rmul__(b, a): return b.__mul__(a) @@ -447,7 +455,7 @@ cdef class pybstr(bytes): # https://github.com/cython/cython/issues/711 if encoding == 'utf-8' and errors == 'surrogateescape': x = _utf8_decode_surrogateescape(self) else: - x = bytes.decode(self, encoding, errors) + x = zbytes.decode(self, encoding, errors) # on py2 e.g. 
bytes.decode('string-escape') returns bytes if PY_MAJOR_VERSION < 3 and isinstance(x, bytes): return pyb(x) @@ -465,7 +473,7 @@ cdef class pybstr(bytes): # https://github.com/cython/cython/issues/711 def casefold(self): return pyb(pyu(self).casefold()) def center(self, width, fillchar=' '): return pyb(pyu(self).center(width, fillchar)) - def count(self, sub, start=None, end=None): return bytes.count(self, _pyb_coerce(sub), start, end) + def count(self, sub, start=None, end=None): return zbytes.count(self, _pyb_coerce(sub), start, end) def endswith(self, suffix, start=None, end=None): if isinstance(suffix, tuple): @@ -475,13 +483,13 @@ cdef class pybstr(bytes): # https://github.com/cython/cython/issues/711 return False if start is None: start = 0 if end is None: end = PY_SSIZE_T_MAX - return bytes.endswith(self, _pyb_coerce(suffix), start, end) + return zbytes.endswith(self, _pyb_coerce(suffix), start, end) def expandtabs(self, tabsize=8): return pyb(pyu(self).expandtabs(tabsize)) # NOTE find/index & friends should return byte-position, not unicode-position - def find(self, sub, start=None, end=None): return bytes.find(self, _pyb_coerce(sub), start, end) - def index(self, sub, start=None, end=None): return bytes.index(self, _pyb_coerce(sub), start, end) + def find(self, sub, start=None, end=None): return zbytes.find(self, _pyb_coerce(sub), start, end) + def index(self, sub, start=None, end=None): return zbytes.index(self, _pyb_coerce(sub), start, end) def isalnum(self): return pyu(self).isalnum() def isalpha(self): return pyu(self).isalpha() @@ -495,21 +503,21 @@ cdef class pybstr(bytes): # https://github.com/cython/cython/issues/711 def isspace(self): return pyu(self).isspace() def istitle(self): return pyu(self).istitle() - def join(self, iterable): return pyb(bytes.join(self, (_pyb_coerce(_) for _ in iterable))) + def join(self, iterable): return pyb(zbytes.join(self, (_pyb_coerce(_) for _ in iterable))) def ljust(self, width, fillchar=' '): return 
pyb(pyu(self).ljust(width, fillchar)) def lower(self): return pyb(pyu(self).lower()) def lstrip(self, chars=None): return pyb(pyu(self).lstrip(chars)) - def partition(self, sep): return tuple(pyb(_) for _ in bytes.partition(self, _pyb_coerce(sep))) + def partition(self, sep): return tuple(pyb(_) for _ in zbytes.partition(self, _pyb_coerce(sep))) def removeprefix(self, prefix): return pyb(pyu(self).removeprefix(prefix)) def removesuffix(self, suffix): return pyb(pyu(self).removesuffix(suffix)) - def replace(self, old, new, count=-1): return pyb(bytes.replace(self, _pyb_coerce(old), _pyb_coerce(new), count)) + def replace(self, old, new, count=-1): return pyb(zbytes.replace(self, _pyb_coerce(old), _pyb_coerce(new), count)) # NOTE rfind/rindex & friends should return byte-position, not unicode-position - def rfind(self, sub, start=None, end=None): return bytes.rfind(self, _pyb_coerce(sub), start, end) - def rindex(self, sub, start=None, end=None): return bytes.rindex(self, _pyb_coerce(sub), start, end) + def rfind(self, sub, start=None, end=None): return zbytes.rfind(self, _pyb_coerce(sub), start, end) + def rindex(self, sub, start=None, end=None): return zbytes.rindex(self, _pyb_coerce(sub), start, end) def rjust(self, width, fillchar=' '): return pyb(pyu(self).rjust(width, fillchar)) - def rpartition(self, sep): return tuple(pyb(_) for _ in bytes.rpartition(self, _pyb_coerce(sep))) + def rpartition(self, sep): return tuple(pyb(_) for _ in zbytes.rpartition(self, _pyb_coerce(sep))) def rsplit(self, sep=None, maxsplit=-1): v = pyu(self).rsplit(sep, maxsplit) return list([pyb(_) for _ in v]) @@ -527,16 +535,16 @@ cdef class pybstr(bytes): # https://github.com/cython/cython/issues/711 return False if start is None: start = 0 if end is None: end = PY_SSIZE_T_MAX - return bytes.startswith(self, _pyb_coerce(prefix), start, end) + return zbytes.startswith(self, _pyb_coerce(prefix), start, end) def strip(self, chars=None): return pyb(pyu(self).strip(chars)) def 
swapcase(self): return pyb(pyu(self).swapcase()) def title(self): return pyb(pyu(self).title()) def translate(self, table, delete=None): # bytes mode (compatibility with str/py2) - if table is None or isinstance(table, bytes) or delete is not None: + if table is None or isinstance(table, zbytes) or delete is not None: if delete is None: delete = b'' - return pyb(bytes.translate(self, table, delete)) + return pyb(zbytes.translate(self, table, delete)) # unicode mode else: return pyb(pyu(self).translate(table)) @@ -566,8 +574,8 @@ cdef PyObject* _pybstr_tp_new(PyTypeObject* _cls, PyObject* _argv, PyObject* _kw # bytes uses "optimized" and custom .tp_basicsize and .tp_itemsize: # https://github.com/python/cpython/blob/v2.7.18-0-g8d21aa21f2c/Objects/stringobject.c#L26-L32 # https://github.com/python/cpython/blob/v2.7.18-0-g8d21aa21f2c/Objects/stringobject.c#L3816-L3820 -(pybstr) .tp_basicsize = (bytes).tp_basicsize -(pybstr) .tp_itemsize = (bytes).tp_itemsize +(pybstr) .tp_basicsize = (zbytes).tp_basicsize +(pybstr) .tp_itemsize = (zbytes).tp_itemsize # make sure pybstr C layout corresponds to bytes C layout exactly # we patched cython to allow from-bytes cdef class inheritance and we also set @@ -646,7 +654,7 @@ cdef class pyustr(unicode): # retrieve state, which gives ustr, not unicode. Fix state to be unicode ourselves. 
def __reduce_ex__(self, protocol): if protocol >= 2: - return unicode.__reduce_ex__(self, protocol) + return zunicode.__reduce_ex__(self, protocol) return ( pycopyreg._reconstructor, (self.__class__, self.__class__, _udata(self)) @@ -656,7 +664,7 @@ cdef class pyustr(unicode): def __hash__(self): # see pybstr.__hash__ for why we stick to hash of current str if PY_MAJOR_VERSION >= 3: - return unicode.__hash__(self) + return zunicode.__hash__(self) else: return hash(pyb(self)) @@ -668,23 +676,23 @@ cdef class pyustr(unicode): b = _pyu_coerce(b) except TypeError: return False - return unicode.__eq__(a, b) + return zunicode.__eq__(a, b) def __ne__(a, b): return not a.__eq__(b) - def __lt__(a, b): return unicode.__lt__(a, _pyu_coerce(b)) - def __gt__(a, b): return unicode.__gt__(a, _pyu_coerce(b)) - def __le__(a, b): return unicode.__le__(a, _pyu_coerce(b)) - def __ge__(a, b): return unicode.__ge__(a, _pyu_coerce(b)) + def __lt__(a, b): return zunicode.__lt__(a, _pyu_coerce(b)) + def __gt__(a, b): return zunicode.__gt__(a, _pyu_coerce(b)) + def __le__(a, b): return zunicode.__le__(a, _pyu_coerce(b)) + def __ge__(a, b): return zunicode.__ge__(a, _pyu_coerce(b)) # len - no need to override # [], [:] def __getitem__(self, idx): - return pyu(unicode.__getitem__(self, idx)) + return pyu(zunicode.__getitem__(self, idx)) # __iter__ def __iter__(self): if PY_MAJOR_VERSION >= 3: - return _pyustrIter(unicode.__iter__(self)) + return _pyustrIter(zunicode.__iter__(self)) else: # on python 2 unicode does not have .__iter__ return PySeqIter_New(self) @@ -692,7 +700,7 @@ cdef class pyustr(unicode): # __contains__ def __contains__(self, key): - return unicode.__contains__(self, _pyu_coerce(key)) + return zunicode.__contains__(self, _pyu_coerce(key)) # __add__, __radd__ (no need to override __iadd__) @@ -703,7 +711,7 @@ cdef class pyustr(unicode): if type(a) is not pyustr: assert type(b) is pyustr return b.__radd__(a) - return pyu(unicode.__add__(a, _pyu_coerce(b))) + return 
pyu(zunicode.__add__(a, _pyu_coerce(b))) def __radd__(b, a): # a.__add__(b) returned NotImplementedError, e.g. for unicode.__add__(bstr) @@ -723,7 +731,7 @@ cdef class pyustr(unicode): if type(a) is not pyustr: assert type(b) is pyustr return b.__rmul__(a) - return pyu(unicode.__mul__(a, b)) + return pyu(zunicode.__mul__(a, b)) def __rmul__(b, a): return b.__mul__(a) @@ -748,7 +756,7 @@ cdef class pyustr(unicode): # NOTE not e.g. `_bvformat(_pyu_coerce(format_spec), (self,))` because # the only format code that string.__format__ should support is # 's', not e.g. 'r'. - return pyu(unicode.__format__(self, format_spec)) + return pyu(zunicode.__format__(self, format_spec)) # encode/decode @@ -763,7 +771,7 @@ cdef class pyustr(unicode): if encoding == 'utf-8' and errors == 'surrogateescape': x = _utf8_encode_surrogateescape(self) else: - x = unicode.encode(self, encoding, errors) + x = zunicode.encode(self, encoding, errors) return pyb(x) if PY_MAJOR_VERSION < 3: @@ -774,15 +782,15 @@ cdef class pyustr(unicode): # all other string methods - def capitalize(self): return pyu(unicode.capitalize(self)) - def casefold(self): return pyu(unicode.casefold(self)) - def center(self, width, fillchar=' '): return pyu(unicode.center(self, width, _pyu_coerce(fillchar))) + def capitalize(self): return pyu(zunicode.capitalize(self)) + def casefold(self): return pyu(zunicode.casefold(self)) + def center(self, width, fillchar=' '): return pyu(zunicode.center(self, width, _pyu_coerce(fillchar))) def count(self, sub, start=None, end=None): # cython optimizes unicode.count to directly call PyUnicode_Count - # - cannot use None for start/stop https://github.com/cython/cython/issues/4737 if start is None: start = 0 if end is None: end = PY_SSIZE_T_MAX - return unicode.count(self, _pyu_coerce(sub), start, end) + return zunicode.count(self, _pyu_coerce(sub), start, end) def endswith(self, suffix, start=None, end=None): if isinstance(suffix, tuple): for _ in suffix: @@ -791,16 +799,16 @@ cdef 
class pyustr(unicode): return False if start is None: start = 0 if end is None: end = PY_SSIZE_T_MAX - return unicode.endswith(self, _pyu_coerce(suffix), start, end) - def expandtabs(self, tabsize=8): return pyu(unicode.expandtabs(self, tabsize)) + return zunicode.endswith(self, _pyu_coerce(suffix), start, end) + def expandtabs(self, tabsize=8): return pyu(zunicode.expandtabs(self, tabsize)) def find(self, sub, start=None, end=None): if start is None: start = 0 if end is None: end = PY_SSIZE_T_MAX - return unicode.find(self, _pyu_coerce(sub), start, end) + return zunicode.find(self, _pyu_coerce(sub), start, end) def index(self, sub, start=None, end=None): if start is None: start = 0 if end is None: end = PY_SSIZE_T_MAX - return unicode.index(self, _pyu_coerce(sub), start, end) + return zunicode.index(self, _pyu_coerce(sub), start, end) # isalnum(self) no need to override # isalpha(self) no need to override @@ -814,41 +822,41 @@ cdef class pyustr(unicode): # isspace(self) no need to override # istitle(self) no need to override - def join(self, iterable): return pyu(unicode.join(self, (_pyu_coerce(_) for _ in iterable))) - def ljust(self, width, fillchar=' '): return pyu(unicode.ljust(self, width, _pyu_coerce(fillchar))) - def lower(self): return pyu(unicode.lower(self)) - def lstrip(self, chars=None): return pyu(unicode.lstrip(self, _xpyu_coerce(chars))) - def partition(self, sep): return tuple(pyu(_) for _ in unicode.partition(self, _pyu_coerce(sep))) - def removeprefix(self, prefix): return pyu(unicode.removeprefix(self, _pyu_coerce(prefix))) - def removesuffix(self, suffix): return pyu(unicode.removesuffix(self, _pyu_coerce(suffix))) - def replace(self, old, new, count=-1): return pyu(unicode.replace(self, _pyu_coerce(old), _pyu_coerce(new), count)) + def join(self, iterable): return pyu(zunicode.join(self, (_pyu_coerce(_) for _ in iterable))) + def ljust(self, width, fillchar=' '): return pyu(zunicode.ljust(self, width, _pyu_coerce(fillchar))) + def lower(self): 
return pyu(zunicode.lower(self)) + def lstrip(self, chars=None): return pyu(zunicode.lstrip(self, _xpyu_coerce(chars))) + def partition(self, sep): return tuple(pyu(_) for _ in zunicode.partition(self, _pyu_coerce(sep))) + def removeprefix(self, prefix): return pyu(zunicode.removeprefix(self, _pyu_coerce(prefix))) + def removesuffix(self, suffix): return pyu(zunicode.removesuffix(self, _pyu_coerce(suffix))) + def replace(self, old, new, count=-1): return pyu(zunicode.replace(self, _pyu_coerce(old), _pyu_coerce(new), count)) def rfind(self, sub, start=None, end=None): if start is None: start = 0 if end is None: end = PY_SSIZE_T_MAX - return unicode.rfind(self, _pyu_coerce(sub), start, end) + return zunicode.rfind(self, _pyu_coerce(sub), start, end) def rindex(self, sub, start=None, end=None): if start is None: start = 0 if end is None: end = PY_SSIZE_T_MAX - return unicode.rindex(self, _pyu_coerce(sub), start, end) - def rjust(self, width, fillchar=' '): return pyu(unicode.rjust(self, width, _pyu_coerce(fillchar))) - def rpartition(self, sep): return tuple(pyu(_) for _ in unicode.rpartition(self, _pyu_coerce(sep))) + return zunicode.rindex(self, _pyu_coerce(sub), start, end) + def rjust(self, width, fillchar=' '): return pyu(zunicode.rjust(self, width, _pyu_coerce(fillchar))) + def rpartition(self, sep): return tuple(pyu(_) for _ in zunicode.rpartition(self, _pyu_coerce(sep))) def rsplit(self, sep=None, maxsplit=-1): - v = unicode.rsplit(self, _xpyu_coerce(sep), maxsplit) + v = zunicode.rsplit(self, _xpyu_coerce(sep), maxsplit) return list([pyu(_) for _ in v]) - def rstrip(self, chars=None): return pyu(unicode.rstrip(self, _xpyu_coerce(chars))) + def rstrip(self, chars=None): return pyu(zunicode.rstrip(self, _xpyu_coerce(chars))) def split(self, sep=None, maxsplit=-1): # cython optimizes unicode.split to directly call PyUnicode_Split - cannot use None for sep # and cannot also use object=NULL https://github.com/cython/cython/issues/4737 if sep is None: if 
PY_MAJOR_VERSION >= 3: - v = unicode.split(self, maxsplit=maxsplit) + v = zunicode.split(self, maxsplit=maxsplit) else: # on py2 unicode.split does not accept keyword arguments - v = _udata(self).split(None, maxsplit) + v = zunicode.split(self, None, maxsplit) else: - v = unicode.split(self, _pyu_coerce(sep), maxsplit) + v = zunicode.split(self, _pyu_coerce(sep), maxsplit) return list([pyu(_) for _ in v]) - def splitlines(self, keepends=False): return list(pyu(_) for _ in unicode.splitlines(self, keepends)) + def splitlines(self, keepends=False): return list(pyu(_) for _ in zunicode.splitlines(self, keepends)) def startswith(self, prefix, start=None, end=None): if isinstance(prefix, tuple): for _ in prefix: @@ -857,10 +865,10 @@ cdef class pyustr(unicode): return False if start is None: start = 0 if end is None: end = PY_SSIZE_T_MAX - return unicode.startswith(self, _pyu_coerce(prefix), start, end) - def strip(self, chars=None): return pyu(unicode.strip(self, _xpyu_coerce(chars))) - def swapcase(self): return pyu(unicode.swapcase(self)) - def title(self): return pyu(unicode.title(self)) + return zunicode.startswith(self, _pyu_coerce(prefix), start, end) + def strip(self, chars=None): return pyu(zunicode.strip(self, _xpyu_coerce(chars))) + def swapcase(self): return pyu(zunicode.swapcase(self)) + def title(self): return pyu(zunicode.title(self)) def translate(self, table): # unicode.translate does not accept bstr values @@ -869,10 +877,10 @@ cdef class pyustr(unicode): if not isinstance(v, int): # either unicode ordinal, v = _xpyu_coerce(v) # character or None t[k] = v - return pyu(unicode.translate(self, t)) + return pyu(zunicode.translate(self, t)) - def upper(self): return pyu(unicode.upper(self)) - def zfill(self, width): return pyu(unicode.zfill(self, width)) + def upper(self): return pyu(zunicode.upper(self)) + def zfill(self, width): return pyu(zunicode.zfill(self, width)) @staticmethod def maketrans(x=None, y=None, z=None): @@ -884,11 +892,11 @@ cdef class 
pyustr(unicode): if not isinstance(k, int): k = pyu(k) _[k] = v - return unicode.maketrans(_) + return zunicode.maketrans(_) elif z is None: - return unicode.maketrans(pyu(x), pyu(y)) # std maketrans does not accept b + return zunicode.maketrans(pyu(x), pyu(y)) # std maketrans does not accept b else: - return unicode.maketrans(pyu(x), pyu(y), pyu(z)) # ----//---- + return zunicode.maketrans(pyu(x), pyu(y), pyu(z)) # ----//---- # hand-made on py2 t = {} @@ -994,7 +1002,7 @@ IF PY2: assert isinstance(o, bytes) o = o o = bytes(buffer(o)) # change tp_type to bytes instead of pybstr - return (<_PyTypeObject_Print*>Py_TYPE(o)) .tp_print(o, f, Py_PRINT_RAW) + return (<_PyTypeObject_Print*>zbytes) .tp_print(o, f, Py_PRINT_RAW) (<_PyTypeObject_Print*>Py_TYPE(pybstr())) .tp_print = _pybstr_tp_print @@ -1691,7 +1699,7 @@ cdef _bprintf(const uint8_t[::1] fmt, xarg): # -> pybstr #print('--> __mod__ ', repr(fmt1), ' % ', repr(arg)) try: - s = bytes.__mod__(fmt1, arg) + s = zbytes.__mod__(fmt1, arg) except ValueError as e: # adjust position in '... 
at index ' from fmt1 to fmt if len(e.args) == 1: @@ -1945,7 +1953,7 @@ def _utf8_decode_surrogateescape(const uint8_t[::1] s): # -> unicode def _utf8_encode_surrogateescape(s): # -> bytes assert isinstance(s, unicode) if PY_MAJOR_VERSION >= 3: - return unicode.encode(s, 'UTF-8', 'surrogateescape') + return zunicode.encode(s, 'UTF-8', 'surrogateescape') # py2 does not have surrogateescape error handler, and even if we # provide one, builtin unicode.encode() does not treat From baf84437527d32325bea3c59f504a41b341daf42 Mon Sep 17 00:00:00 2001 From: Kirill Smelkov Date: Mon, 1 May 2023 19:46:26 +0300 Subject: [PATCH 11/29] golang_str: pybstr -> _pybstr ; pyustr -> _pyustr And let pybstr/pyustr point to version of bstr/ustr types that is actually in use: - when bytes/unicode are not patched -> to _pybstr/_pyustr - when bytes/unicode will be patched -> to bytes/unicode to where original _pybstr/_pyustr were copied during bytes/unicode patching. at runtime the code uses pybstr/pyustr instead of _pybstr/_pyustr. --- golang/_golang_str.pyx | 52 ++++++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/golang/_golang_str.pyx b/golang/_golang_str.pyx index fae05e8..c52e71b 100644 --- a/golang/_golang_str.pyx +++ b/golang/_golang_str.pyx @@ -87,6 +87,14 @@ else: cdef object zbytes = (&PyBytes_Type) cdef object zunicode = (&PyUnicode_Type) +# pybstr/pyustr point to version of bstr/ustr types that is actually in use: +# - when bytes/unicode are not patched -> to _pybstr/_pyustr +# - when bytes/unicode will be patched -> to bytes/unicode to where original +# _pybstr/_pyustr were copied during bytes/unicode patching. +# at runtime the code should use pybstr/pyustr instead of _pybstr/_pyustr. +pybstr = _pybstr # initially point to -> _pybstr/_pyustr +pyustr = _pyustr # TODO -> cdef for speed + def pyb(s): # -> bstr """b converts object to bstr. 
@@ -250,8 +258,8 @@ def pyuchr(int i): # -> 1-character ustr return pyu(unichr(i)) -@no_gc # note setup.py assist this to compile despite -cdef class pybstr(bytes): # https://github.com/cython/cython/issues/711 +@no_gc # note setup.py assist this to compile despite +cdef class _pybstr(bytes): # https://github.com/cython/cython/issues/711 """bstr is byte-string. It is based on bytes and can automatically convert to/from unicode. @@ -284,7 +292,7 @@ cdef class pybstr(bytes): # https://github.com/cython/cython/issues/711 """ # XXX due to "cannot `cdef class` with __new__" (https://github.com/cython/cython/issues/799) - # pybstr.__new__ is hand-made in _pybstr_tp_new which invokes ↓ .____new__() . + # _pybstr.__new__ is hand-made in _pybstr_tp_new which invokes ↓ .____new__() . @staticmethod def ____new__(cls, object='', encoding=None, errors=None): # encoding or errors -> object must expose buffer interface @@ -557,7 +565,7 @@ cdef class pybstr(bytes): # https://github.com/cython/cython/issues/711 return pyustr.maketrans(x, y, z) -# hand-made pybstr.__new__ (workaround for https://github.com/cython/cython/issues/799) +# hand-made _pybstr.__new__ (workaround for https://github.com/cython/cython/issues/799) cdef PyObject* _pybstr_tp_new(PyTypeObject* _cls, PyObject* _argv, PyObject* _kw) except NULL: argv = () if _argv != NULL: @@ -566,26 +574,26 @@ cdef PyObject* _pybstr_tp_new(PyTypeObject* _cls, PyObject* _argv, PyObject* _kw if _kw != NULL: kw = _kw - cdef object x = pybstr.____new__(_cls, *argv, **kw) + cdef object x = _pybstr.____new__(_cls, *argv, **kw) Py_INCREF(x) return x -(<_XPyTypeObject*>pybstr).tp_new = &_pybstr_tp_new +(<_XPyTypeObject*>_pybstr).tp_new = &_pybstr_tp_new # bytes uses "optimized" and custom .tp_basicsize and .tp_itemsize: # https://github.com/python/cpython/blob/v2.7.18-0-g8d21aa21f2c/Objects/stringobject.c#L26-L32 # https://github.com/python/cpython/blob/v2.7.18-0-g8d21aa21f2c/Objects/stringobject.c#L3816-L3820 -(pybstr) .tp_basicsize = 
(zbytes).tp_basicsize -(pybstr) .tp_itemsize = (zbytes).tp_itemsize +(_pybstr) .tp_basicsize = (zbytes).tp_basicsize +(_pybstr) .tp_itemsize = (zbytes).tp_itemsize -# make sure pybstr C layout corresponds to bytes C layout exactly +# make sure _pybstr C layout corresponds to bytes C layout exactly # we patched cython to allow from-bytes cdef class inheritance and we also set -# .tp_basicsize directly above. All this works ok only if C layouts for pybstr +# .tp_basicsize directly above. All this works ok only if C layouts for _pybstr # and bytes are completely the same. -assert sizeof(pybstr) == sizeof(PyBytesObject) +assert sizeof(_pybstr) == sizeof(PyBytesObject) @no_gc -cdef class pyustr(unicode): +cdef class _pyustr(unicode): """ustr is unicode-string. It is based on unicode and can automatically convert to/from bytes. @@ -613,7 +621,7 @@ cdef class pyustr(unicode): """ # XXX due to "cannot `cdef class` with __new__" (https://github.com/cython/cython/issues/799) - # pyustr.__new__ is hand-made in _pyustr_tp_new which invokes ↓ .____new__() . + # _pyustr.__new__ is hand-made in _pyustr_tp_new which invokes ↓ .____new__() . 
@staticmethod def ____new__(cls, object='', encoding=None, errors=None): # encoding or errors -> object must expose buffer interface @@ -662,7 +670,7 @@ cdef class pyustr(unicode): def __hash__(self): - # see pybstr.__hash__ for why we stick to hash of current str + # see _pybstr.__hash__ for why we stick to hash of current str if PY_MAJOR_VERSION >= 3: return zunicode.__hash__(self) else: @@ -921,7 +929,7 @@ cdef class pyustr(unicode): return t -# hand-made pyustr.__new__ (workaround for https://github.com/cython/cython/issues/799) +# hand-made _pyustr.__new__ (workaround for https://github.com/cython/cython/issues/799) cdef PyObject* _pyustr_tp_new(PyTypeObject* _cls, PyObject* _argv, PyObject* _kw) except NULL: argv = () if _argv != NULL: @@ -930,13 +938,13 @@ cdef PyObject* _pyustr_tp_new(PyTypeObject* _cls, PyObject* _argv, PyObject* _kw if _kw != NULL: kw = _kw - cdef object x = pyustr.____new__(_cls, *argv, **kw) + cdef object x = _pyustr.____new__(_cls, *argv, **kw) Py_INCREF(x) return x -(<_XPyTypeObject*>pyustr).tp_new = &_pyustr_tp_new +(<_XPyTypeObject*>_pyustr).tp_new = &_pyustr_tp_new -# similarly to bytes - want same C layout for pyustr vs unicode -assert sizeof(pyustr) == sizeof(PyUnicodeObject) +# similarly to bytes - want same C layout for _pyustr vs unicode +assert sizeof(_pyustr) == sizeof(PyUnicodeObject) # _pyustrIter wraps unicode iterator to return pyustr for each yielded character. @@ -1004,7 +1012,7 @@ IF PY2: o = bytes(buffer(o)) # change tp_type to bytes instead of pybstr return (<_PyTypeObject_Print*>zbytes) .tp_print(o, f, Py_PRINT_RAW) - (<_PyTypeObject_Print*>Py_TYPE(pybstr())) .tp_print = _pybstr_tp_print + (<_PyTypeObject_Print*>Py_TYPE(_pybstr())) .tp_print = _pybstr_tp_print # whiteout .sq_slice for pybstr/pyustr inherited from str/unicode. @@ -1012,8 +1020,8 @@ IF PY2: # If we don't do this e.g. bstr[:] will be handled by str.__getslice__ instead # of bstr.__getitem__, and will return str instead of bstr. 
if PY2: - (<_XPyTypeObject*>pybstr) .tp_as_sequence.sq_slice = NULL - (<_XPyTypeObject*>pyustr) .tp_as_sequence.sq_slice = NULL + (<_XPyTypeObject*>_pybstr) .tp_as_sequence.sq_slice = NULL + (<_XPyTypeObject*>_pyustr) .tp_as_sequence.sq_slice = NULL # ---- adjust bstr/ustr classes after what cython generated ---- From 90f0e0ff69ef261040e61573efb5bdd1fa91d2e6 Mon Sep 17 00:00:00 2001 From: Kirill Smelkov Date: Fri, 23 Jun 2023 17:54:03 +0300 Subject: [PATCH 12/29] strconv: Add benchmarks for quote and unquote MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This functions are currently relatively slow. They were initially used in zodbdump and zodbrestore, where their speed did not matter much, but with bstr and ustr, since e.g. quote is used in repr, not having them to perform with speed similar to builtin string escaping starts to be an issue. Tatuya Kamada reports at https://lab.nexedi.com/nexedi/pygolang/merge_requests/21#note_170833 : ### 3. `u` seems slow with large arrays especially when `repr` it I have faced a slowness while testing `u`, `b` with python 2.7, especially with `repr`. ```python >>> timeit.timeit("from golang import b,u; u('あ'*199998)", number=10) 2.02020001411438 >>> timeit.timeit("from golang import b,u; repr(u('あ'*199998))", number=10) 54.60263395309448 ``` `bytes`(str) is very fast. ```python >>> timeit.timeit("from golang import b,u; bytes('あ'*199998)", number=10) 0.000392913818359375 >>> timeit.timeit("from golang import b,u; repr(bytes('あ'*199998))", number=10) 0.4604980945587158 ``` `b` is much faster than `u`, but still the repr seems slow. 
``` >>> timeit.timeit("from golang import b,u; b('あ'*199998)", number=10) 0.0009968280792236328 >>> timeit.timeit("from golang import b,u; repr(b('あ'*199998))", number=10) 25.498882055282593 ``` The "repr" part of this problem is due to that both bstr.__repr__ and ustr.__repr__ use custom quoting routines which currently are implemented in pure python in strconv module: https://lab.nexedi.com/kirr/pygolang/blob/300d7dfa/golang/_golang_str.pyx#L282-291 https://lab.nexedi.com/kirr/pygolang/blob/300d7dfa/golang/_golang_str.pyx#L582-591 https://lab.nexedi.com/kirr/pygolang/blob/300d7dfa/golang/_golang_str.pyx#L941-970 https://lab.nexedi.com/kirr/pygolang/blob/300d7dfa/golang/strconv.py#L31-92 The fix would be to move strconv.py to Cython and to correspondingly rework it to avoid using python-level constructs during quoting internally. Working on that was not a priority, but soon I will need to move strconv to Cython for another reason: to be able to break import cycle in between _golang and strconv. So it makes sense to add strconv benchmark first - since we'll start moving it to Cython anyway - to see where we are and how further changes will help performance-wise. Currently we are at name time/op quote[a] 910µs ± 0% quote[\u03b1] 1.23ms ± 0% quote[\u65e5] 800µs ± 0% quote[\U0001f64f] 1.06ms ± 1% stdquote 1.17µs ± 0% unquote[a] 1.33ms ± 1% unquote[\u03b1] 952µs ± 2% unquote[\u65e5] 613µs ± 2% unquote[\U0001f64f] 3.62ms ± 1% stdunquote 788ns ± 0% i.e. on py2 quoting is ~ 1000x slower than builtin string escaping, and unquoting is even slower. 
on py3 the situation is better, but still not good: name time/op quote[a] 579µs ± 1% quote[\u03b1] 942µs ± 1% quote[\u65e5] 595µs ± 0% quote[\U0001f64f] 274µs ± 1% stdquote 2.70µs ± 0% unquote[a] 696µs ± 1% unquote[\u03b1] 763µs ± 0% unquote[\u65e5] 474µs ± 1% unquote[\U0001f64f] 187µs ± 0% stdunquote 808ns ± 0% δ(py2, py3) for the reference: name py2 time/op py3 time/op delta quote[a] 910µs ± 0% 579µs ± 1% -36.42% (p=0.008 n=5+5) quote[\u03b1] 1.23ms ± 0% 0.94ms ± 1% -23.17% (p=0.008 n=5+5) quote[\u65e5] 800µs ± 0% 595µs ± 0% -25.63% (p=0.016 n=4+5) quote[\U0001f64f] 1.06ms ± 1% 0.27ms ± 1% -74.23% (p=0.008 n=5+5) stdquote 1.17µs ± 0% 2.70µs ± 0% +129.71% (p=0.008 n=5+5) unquote[a] 1.33ms ± 1% 0.70ms ± 1% -47.71% (p=0.008 n=5+5) unquote[\u03b1] 952µs ± 2% 763µs ± 0% -19.82% (p=0.008 n=5+5) unquote[\u65e5] 613µs ± 2% 474µs ± 1% -22.76% (p=0.008 n=5+5) unquote[\U0001f64f] 3.62ms ± 1% 0.19ms ± 0% -94.84% (p=0.016 n=5+4) stdunquote 788ns ± 0% 808ns ± 0% +2.59% (p=0.016 n=4+5) --- golang/strconv_test.py | 56 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 54 insertions(+), 2 deletions(-) diff --git a/golang/strconv_test.py b/golang/strconv_test.py index 5dc68c1..31fdca0 100644 --- a/golang/strconv_test.py +++ b/golang/strconv_test.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright (C) 2018-2022 Nexedi SA and Contributors. +# Copyright (C) 2018-2023 Nexedi SA and Contributors. 
# Kirill Smelkov # # This program is free software: you can Use, Study, Modify and Redistribute @@ -26,7 +26,10 @@ from six import int2byte as bchr from six.moves import range as xrange -from pytest import raises +from pytest import raises, mark + +import codecs + def byterange(start, stop): b = b"" @@ -138,3 +141,52 @@ def test_unquote_bad(): with raises(ValueError) as exc: unquote(tin) assert exc.value.args == (err,) + + +# ---- benchmarks ---- + +# quoting + unquoting +uchar_testv = ['a', # ascii + u'α', # 2-bytes utf8 + u'\u65e5', # 3-bytes utf8 + u'\U0001f64f'] # 4-bytes utf8 + +@mark.parametrize('ch', uchar_testv) +def bench_quote(b, ch): + s = bstr_ch1000(ch) + q = quote + for i in xrange(b.N): + q(s) + +def bench_stdquote(b): + s = b'a'*1000 + q = repr + for i in xrange(b.N): + q(s) + + +@mark.parametrize('ch', uchar_testv) +def bench_unquote(b, ch): + s = bstr_ch1000(ch) + s = quote(s) + unq = unquote + for i in xrange(b.N): + unq(s) + +def bench_stdunquote(b): + s = b'"' + b'a'*1000 + b'"' + escape_decode = codecs.escape_decode + def unq(s): return escape_decode(s[1:-1])[0] + for i in xrange(b.N): + unq(s) + + +# bstr_ch1000 returns bstr with many repetitions of character ch occupying ~ 1000 bytes. +def bstr_ch1000(ch): # -> bstr + assert len(ch) == 1 + s = bstr(ch) + s = s * (1000 // len(s)) + if len(s) % 3 == 0: + s += 'x' + assert len(s) == 1000 + return s From 83a1da997a752e136c09992fb2f9e5398aa003f3 Mon Sep 17 00:00:00 2001 From: Kirill Smelkov Date: Mon, 26 Jun 2023 20:55:06 +0300 Subject: [PATCH 13/29] golang, libgolang: Add byte / rune types Those types are the base when working with byte- and unicode strings. It will be clearer to use them explicitly instead of uint8_t and int32_t when processing string. 
--- golang/_golang.pxd | 5 ++++- golang/libgolang.h | 4 ++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/golang/_golang.pxd b/golang/_golang.pxd index 41a2953..2e24025 100644 --- a/golang/_golang.pxd +++ b/golang/_golang.pxd @@ -1,5 +1,5 @@ # cython: language_level=2 -# Copyright (C) 2019-2022 Nexedi SA and Contributors. +# Copyright (C) 2019-2023 Nexedi SA and Contributors. # Kirill Smelkov # # This program is free software: you can Use, Study, Modify and Redistribute @@ -65,6 +65,9 @@ cdef extern from *: # on the edge of Python/nogil world. from libcpp.string cimport string # golang::string = std::string cdef extern from "golang/libgolang.h" namespace "golang" nogil: + ctypedef unsigned char byte + ctypedef signed int rune # = int32 + void panic(const char *) const char *recover() diff --git a/golang/libgolang.h b/golang/libgolang.h index 2cd8abe..0d4c153 100644 --- a/golang/libgolang.h +++ b/golang/libgolang.h @@ -433,6 +433,10 @@ constexpr Nil nil = nullptr; // string is alias for std::string. using string = std::string; +// byte/rune types related to string. +using byte = uint8_t; +using rune = int32_t; + // func is alias for std::function. template using func = std::function; From b9d72051a4d4913aa7d064bf95e58fb5793dec77 Mon Sep 17 00:00:00 2001 From: Kirill Smelkov Date: Mon, 26 Jun 2023 21:01:11 +0300 Subject: [PATCH 14/29] *: uint8_t -> byte, unicode-codepint -> rune We added byte and rune types in the previous patch. Let's use them now throughout whole codebase where appropriate. Currently the only place where unicode-codepoint is used is _utf8_decode_rune. uint8_t was used in many places. 
--- golang/_golang_str.pyx | 13 ++++++------- golang/runtime/_runtime_gevent.pyx | 8 ++++---- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/golang/_golang_str.pyx b/golang/_golang_str.pyx index c52e71b..3b72ca5 100644 --- a/golang/_golang_str.pyx +++ b/golang/_golang_str.pyx @@ -68,7 +68,6 @@ cdef extern from "Python.h": from cython cimport no_gc -from libc.stdint cimport uint8_t from libc.stdio cimport FILE pystrconv = None # = golang.strconv imported at runtime (see __init__.py) @@ -1056,7 +1055,7 @@ _bstrustr_remove_unsupported_slots() # NOTE the return type is str type of current python, so that quoted result # could be directly used in __repr__ or __str__ implementation. cdef _bpysmartquote_u3b2(s): # -> (unicode(py3)|bytes(py2), nonascii_escape) - # TODO change to `const uint8_t[::1] s` after strconv._quote is moved to pyx + # TODO change to `const byte[::1] s` after strconv._quote is moved to pyx if isinstance(s, bytearray): s = _bytearray_data(s) assert isinstance(s, bytes), s @@ -1498,7 +1497,7 @@ cdef class _UnboundMethod(object): # they removed unbound methods on py3 # See also overview of patching bytes.{__repr__,__str__} near _bstringify. cdef object _missing = object() cdef object _atidx_re = pyre.compile('.* at index ([0-9]+)$') -cdef _bprintf(const uint8_t[::1] fmt, xarg): # -> pybstr +cdef _bprintf(const byte[::1] fmt, xarg): # -> pybstr cdef bytearray out = bytearray() cdef tuple argv = None # if xarg is tuple @@ -1570,7 +1569,7 @@ cdef _bprintf(const uint8_t[::1] fmt, xarg): # -> pybstr # differently - on b %r is aliased to %a. cdef int i = 0 cdef int l = len(fmt) - cdef uint8_t c + cdef byte c while i < l: c = fmt[i] i += 1 @@ -1883,9 +1882,9 @@ assert _ucs2_build or sys.maxunicode >= 0x0010ffff # or ucs4 # _utf8_decode_rune decodes next UTF8-character from byte string s. 
# # _utf8_decode_rune(s) -> (r, size) -def _py_utf8_decode_rune(const uint8_t[::1] s): +def _py_utf8_decode_rune(const byte[::1] s): return _utf8_decode_rune(s) -cdef (int, int) _utf8_decode_rune(const uint8_t[::1] s): +cdef (rune, int) _utf8_decode_rune(const byte[::1] s): if len(s) == 0: return _rune_error, 0 @@ -1918,7 +1917,7 @@ cdef (int, int) _utf8_decode_rune(const uint8_t[::1] s): # _utf8_decode_surrogateescape mimics s.decode('utf-8', 'surrogateescape') from py3. -def _utf8_decode_surrogateescape(const uint8_t[::1] s): # -> unicode +def _utf8_decode_surrogateescape(const byte[::1] s): # -> unicode if PY_MAJOR_VERSION >= 3: if len(s) == 0: return u'' # avoid out-of-bounds slice access on &s[0] diff --git a/golang/runtime/_runtime_gevent.pyx b/golang/runtime/_runtime_gevent.pyx index 8c4751d..dcf4f33 100644 --- a/golang/runtime/_runtime_gevent.pyx +++ b/golang/runtime/_runtime_gevent.pyx @@ -40,7 +40,7 @@ ELSE: from gevent import sleep as pygsleep -from libc.stdint cimport uint8_t, uint64_t +from libc.stdint cimport uint64_t from cpython cimport PyObject, Py_INCREF, Py_DECREF from cython cimport final @@ -49,7 +49,7 @@ from golang.runtime._libgolang cimport _libgolang_runtime_ops, _libgolang_sema, from golang.runtime.internal cimport syscall from golang.runtime cimport _runtime_thread from golang.runtime._runtime_pymisc cimport PyExc, pyexc_fetch, pyexc_restore -from golang cimport topyexc +from golang cimport byte, topyexc from libc.stdlib cimport calloc, free from libc.errno cimport EBADF @@ -343,7 +343,7 @@ cdef nogil: cdef: bint _io_read(IOH* ioh, int* out_n, void *buf, size_t count): pygfobj = ioh.pygfobj - cdef uint8_t[::1] mem = buf + cdef byte[::1] mem = buf xmem = memoryview(mem) # to avoid https://github.com/cython/cython/issues/3900 on mem[:0]=b'' try: # NOTE buf might be on stack, so it must not be accessed, e.g. 
from @@ -380,7 +380,7 @@ cdef nogil: cdef: bint _io_write(IOH* ioh, int* out_n, const void *buf, size_t count): pygfobj = ioh.pygfobj - cdef const uint8_t[::1] mem = buf + cdef const byte[::1] mem = buf # NOTE buf might be on stack, so it must not be accessed, e.g. from # FileObjectThread, while our greenlet is parked (see STACK_DEAD_WHILE_PARKED From 4a022e69403b041aa6eee38ac1244bf354ba9519 Mon Sep 17 00:00:00 2001 From: Kirill Smelkov Date: Mon, 26 Jun 2023 21:09:34 +0300 Subject: [PATCH 15/29] unicode/utf8: Start of the package (stub) We will soon need to use error rune codepoint from both golang_str.pyx and strconv.pyx - so we need to move that definition into shared place. What fits best is unicode/utf8, so start that package and move the constant there. --- golang/_golang.pyx | 2 +- golang/_golang_str.pyx | 11 ++++++----- golang/pyx/build.py | 3 +++ golang/unicode/__init__.py | 0 golang/unicode/_utf8.pxd | 28 ++++++++++++++++++++++++++++ golang/unicode/utf8.h | 36 ++++++++++++++++++++++++++++++++++++ golang/unicode/utf8.pxd | 26 ++++++++++++++++++++++++++ 7 files changed, 100 insertions(+), 6 deletions(-) create mode 100644 golang/unicode/__init__.py create mode 100644 golang/unicode/_utf8.pxd create mode 100644 golang/unicode/utf8.h create mode 100644 golang/unicode/utf8.pxd diff --git a/golang/_golang.pyx b/golang/_golang.pyx index 5cca599..24f7f23 100644 --- a/golang/_golang.pyx +++ b/golang/_golang.pyx @@ -3,7 +3,7 @@ # cython: binding=False # cython: c_string_type=str, c_string_encoding=utf8 # distutils: language = c++ -# distutils: depends = libgolang.h os/signal.h _golang_str.pyx +# distutils: depends = libgolang.h os/signal.h unicode/utf8.h _golang_str.pyx # # Copyright (C) 2018-2023 Nexedi SA and Contributors. # Kirill Smelkov diff --git a/golang/_golang_str.pyx b/golang/_golang_str.pyx index 3b72ca5..ba127b6 100644 --- a/golang/_golang_str.pyx +++ b/golang/_golang_str.pyx @@ -22,6 +22,8 @@ It is included from _golang.pyx . 
""" +from golang.unicode cimport utf8 + from cpython cimport PyUnicode_AsUnicode, PyUnicode_GetSize, PyUnicode_FromUnicode from cpython cimport PyUnicode_DecodeUTF8 from cpython cimport PyTypeObject, Py_TYPE, reprfunc, richcmpfunc, binaryfunc @@ -1873,8 +1875,7 @@ cdef extern from "Python.h": from six import unichr # py2: unichr py3: chr from six import int2byte as bchr # py2: chr py3: lambda x: bytes((x,)) -cdef int _rune_error = 0xFFFD # unicode replacement character -_py_rune_error = _rune_error +_py_rune_error = utf8.RuneError cdef bint _ucs2_build = (sys.maxunicode == 0xffff) # ucs2 assert _ucs2_build or sys.maxunicode >= 0x0010ffff # or ucs4 @@ -1886,7 +1887,7 @@ def _py_utf8_decode_rune(const byte[::1] s): return _utf8_decode_rune(s) cdef (rune, int) _utf8_decode_rune(const byte[::1] s): if len(s) == 0: - return _rune_error, 0 + return utf8.RuneError, 0 cdef int l = min(len(s), 4) # max size of an UTF-8 encoded character while l > 0: @@ -1913,7 +1914,7 @@ cdef (rune, int) _utf8_decode_rune(const byte[::1] s): continue # invalid UTF-8 - return _rune_error, 1 + return utf8.RuneError, 1 # _utf8_decode_surrogateescape mimics s.decode('utf-8', 'surrogateescape') from py3. 
@@ -1932,7 +1933,7 @@ def _utf8_decode_surrogateescape(const byte[::1] s): # -> unicode while len(s) > 0: r, width = _utf8_decode_rune(s) - if r == _rune_error and width == 1: + if r == utf8.RuneError and width == 1: b = s[0] assert 0x80 <= b <= 0xff, b emit(unichr(0xdc00 + b)) diff --git a/golang/pyx/build.py b/golang/pyx/build.py index 48f40ab..95e0b17 100644 --- a/golang/pyx/build.py +++ b/golang/pyx/build.py @@ -226,6 +226,7 @@ def _with_build_defaults(name, kw): # -> (pygo, kw') 'os.h', 'os/signal.h', 'pyx/runtime.h', + 'unicode/utf8.h', '_testing.h', '_compat/windows/strings.h', '_compat/windows/unistd.h', @@ -274,6 +275,8 @@ def Extension(name, sources, **kw): 'os/signal.pxd', 'os/_signal.pxd', 'pyx/runtime.pxd', + 'unicode/utf8.pxd', + 'unicode/_utf8.pxd', ]]) kw['depends'] = dependv diff --git a/golang/unicode/__init__.py b/golang/unicode/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/golang/unicode/_utf8.pxd b/golang/unicode/_utf8.pxd new file mode 100644 index 0000000..523db10 --- /dev/null +++ b/golang/unicode/_utf8.pxd @@ -0,0 +1,28 @@ +# cython: language_level=2 +# Copyright (C) 2023 Nexedi SA and Contributors. +# Kirill Smelkov +# +# This program is free software: you can Use, Study, Modify and Redistribute +# it under the terms of the GNU General Public License version 3, or (at your +# option) any later version, as published by the Free Software Foundation. +# +# You can also Link and Combine this program with other software covered by +# the terms of any of the Free Software licenses or any of the Open Source +# Initiative approved licenses and Convey the resulting work. Corresponding +# source of such a combination shall include the source code for all other +# software used. +# +# This program is distributed WITHOUT ANY WARRANTY; without even the implied +# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# See COPYING file for full licensing terms. 
+# See https://www.nexedi.com/licensing for rationale and options. +"""Package utf8 mirrors Go package utf8. + +See https://golang.org/pkg/unicode/utf8 for Go utf8 package documentation. +""" + +from golang cimport rune + +cdef extern from "golang/unicode/utf8.h" namespace "golang::unicode::utf8" nogil: + rune RuneError diff --git a/golang/unicode/utf8.h b/golang/unicode/utf8.h new file mode 100644 index 0000000..c43af31 --- /dev/null +++ b/golang/unicode/utf8.h @@ -0,0 +1,36 @@ +#ifndef _NXD_LIBGOLANG_UNICODE_UTF8_H +#define _NXD_LIBGOLANG_UNICODE_UTF8_H + +// Copyright (C) 2023 Nexedi SA and Contributors. +// Kirill Smelkov +// +// This program is free software: you can Use, Study, Modify and Redistribute +// it under the terms of the GNU General Public License version 3, or (at your +// option) any later version, as published by the Free Software Foundation. +// +// You can also Link and Combine this program with other software covered by +// the terms of any of the Free Software licenses or any of the Open Source +// Initiative approved licenses and Convey the resulting work. Corresponding +// source of such a combination shall include the source code for all other +// software used. +// +// This program is distributed WITHOUT ANY WARRANTY; without even the implied +// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +// +// See COPYING file for full licensing terms. +// See https://www.nexedi.com/licensing for rationale and options. + +// Package utf8 mirrors Go package utf8. 
+ +#include + +// golang::unicode::utf8:: +namespace golang { +namespace unicode { +namespace utf8 { + +constexpr rune RuneError = 0xFFFD; // unicode replacement character + +}}} // golang::os::utf8:: + +#endif // _NXD_LIBGOLANG_UNICODE_UTF8_H diff --git a/golang/unicode/utf8.pxd b/golang/unicode/utf8.pxd new file mode 100644 index 0000000..6ba154a --- /dev/null +++ b/golang/unicode/utf8.pxd @@ -0,0 +1,26 @@ +# cython: language_level=2 +# Copyright (C) 2023 Nexedi SA and Contributors. +# Kirill Smelkov +# +# This program is free software: you can Use, Study, Modify and Redistribute +# it under the terms of the GNU General Public License version 3, or (at your +# option) any later version, as published by the Free Software Foundation. +# +# You can also Link and Combine this program with other software covered by +# the terms of any of the Free Software licenses or any of the Open Source +# Initiative approved licenses and Convey the resulting work. Corresponding +# source of such a combination shall include the source code for all other +# software used. +# +# This program is distributed WITHOUT ANY WARRANTY; without even the implied +# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# See COPYING file for full licensing terms. +# See https://www.nexedi.com/licensing for rationale and options. +"""Package utf8 mirrors Go package utf8. + +See _utf8.pxd for package documentation. +""" + +# redirect cimport: golang.unicode.utf8 -> golang.unicode._utf8 (see __init__.pxd for rationale) +from golang.unicode._utf8 cimport * From ca559325f0d213209b2db509bf65f94770617b6c Mon Sep 17 00:00:00 2001 From: Kirill Smelkov Date: Mon, 26 Jun 2023 21:30:06 +0300 Subject: [PATCH 16/29] strconv: Move it to pyx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So far this is plain code movement with no type annotations added and internal from-strconv imports still being done via py level. 
As expected this does not help practically for performance yet: name old time/op new time/op delta quote[a] 910µs ± 0% 805µs ± 0% -11.54% (p=0.008 n=5+5) quote[\u03b1] 1.23ms ± 0% 1.21ms ± 0% -1.24% (p=0.008 n=5+5) quote[\u65e5] 800µs ± 0% 785µs ± 0% -1.86% (p=0.016 n=4+5) quote[\U0001f64f] 1.06ms ± 1% 1.04ms ± 0% -1.92% (p=0.008 n=5+5) stdquote 1.17µs ± 0% 1.18µs ± 0% +0.80% (p=0.008 n=5+5) unquote[a] 1.33ms ± 1% 1.26ms ± 0% -5.13% (p=0.008 n=5+5) unquote[\u03b1] 952µs ± 2% 911µs ± 1% -4.25% (p=0.008 n=5+5) unquote[\u65e5] 613µs ± 2% 592µs ± 0% -3.48% (p=0.008 n=5+5) unquote[\U0001f64f] 3.62ms ± 1% 3.46ms ± 0% -4.32% (p=0.008 n=5+5) stdunquote 788ns ± 0% 812ns ± 1% +3.07% (p=0.016 n=4+5) --- golang/.gitignore | 1 + golang/_strconv.pxd | 21 +++++ golang/_strconv.pyx | 183 ++++++++++++++++++++++++++++++++++++++++++++ golang/pyx/build.py | 2 + golang/strconv.pxd | 26 +++++++ golang/strconv.py | 166 ++-------------------------------------- setup.py | 3 + 7 files changed, 242 insertions(+), 160 deletions(-) create mode 100644 golang/_strconv.pxd create mode 100644 golang/_strconv.pyx create mode 100644 golang/strconv.pxd diff --git a/golang/.gitignore b/golang/.gitignore index 892afba..7492664 100644 --- a/golang/.gitignore +++ b/golang/.gitignore @@ -9,6 +9,7 @@ /_io.cpp /_os.cpp /_os_test.cpp +/_strconv.cpp /_strings_test.cpp /_sync.cpp /_sync_test.cpp diff --git a/golang/_strconv.pxd b/golang/_strconv.pxd new file mode 100644 index 0000000..5df4aef --- /dev/null +++ b/golang/_strconv.pxd @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- +# cython: language_level=2 +# Copyright (C) 2018-2023 Nexedi SA and Contributors. +# Kirill Smelkov +# +# This program is free software: you can Use, Study, Modify and Redistribute +# it under the terms of the GNU General Public License version 3, or (at your +# option) any later version, as published by the Free Software Foundation. 
+# +# You can also Link and Combine this program with other software covered by +# the terms of any of the Free Software licenses or any of the Open Source +# Initiative approved licenses and Convey the resulting work. Corresponding +# source of such a combination shall include the source code for all other +# software used. +# +# This program is distributed WITHOUT ANY WARRANTY; without even the implied +# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# See COPYING file for full licensing terms. +# See https://www.nexedi.com/licensing for rationale and options. +"""Package strconv provides Go-compatible string conversions.""" diff --git a/golang/_strconv.pyx b/golang/_strconv.pyx new file mode 100644 index 0000000..deb2d78 --- /dev/null +++ b/golang/_strconv.pyx @@ -0,0 +1,183 @@ +# -*- coding: utf-8 -*- +# cython: language_level=2 +# Copyright (C) 2018-2023 Nexedi SA and Contributors. +# Kirill Smelkov +# +# This program is free software: you can Use, Study, Modify and Redistribute +# it under the terms of the GNU General Public License version 3, or (at your +# option) any later version, as published by the Free Software Foundation. +# +# You can also Link and Combine this program with other software covered by +# the terms of any of the Free Software licenses or any of the Open Source +# Initiative approved licenses and Convey the resulting work. Corresponding +# source of such a combination shall include the source code for all other +# software used. +# +# This program is distributed WITHOUT ANY WARRANTY; without even the implied +# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# See COPYING file for full licensing terms. +# See https://www.nexedi.com/licensing for rationale and options. 
+"""_strconv.pyx implements strconv.pyx - see _strconv.pxd for package overview.""" + +from __future__ import print_function, absolute_import + +import unicodedata, codecs +from six.moves import range as xrange + +from golang import b +from golang._golang import _py_utf8_decode_rune as _utf8_decode_rune, _py_rune_error as _rune_error, _xunichr + + +# quote quotes unicode|bytes string into valid "..." bytestring always quoted with ". +def quote(s): # -> bstr + q, _ = _quote(b(s), b'"') + return b(q) + +def _quote(s, quote): # -> (quoted, nonascii_escape) + assert isinstance(s, bytes), type(s) + assert isinstance(quote, bytes), type(quote) + assert len(quote) == 1, repr(quote) + + outv = [] + emit = outv.append + nonascii_escape = False + i = 0 + while i < len(s): + c = s[i:i+1] + # fast path - ASCII only + if ord(c) < 0x80: + if c in (b'\\', quote): + emit(b'\\'+c) + + # printable ASCII + elif b' ' <= c <= b'\x7e': + emit(c) + + # non-printable ASCII + elif c == b'\t': + emit(br'\t') + elif c == b'\n': + emit(br'\n') + elif c == b'\r': + emit(br'\r') + + # everything else is non-printable + else: + emit(br'\x%02x' % ord(c)) + + i += 1 + + # slow path - full UTF-8 decoding + unicodedata + else: + r, size = _utf8_decode_rune(s[i:]) + isize = i + size + + # decode error - just emit raw byte as escaped + if r == _rune_error and size == 1: + nonascii_escape = True + emit(br'\x%02x' % ord(c)) + + # printable utf-8 characters go as is + elif unicodedata.category(_xunichr(r))[0] in _printable_cat0: + emit(s[i:isize]) + + # everything else goes in numeric byte escapes + else: + nonascii_escape = True + for j in xrange(i, isize): + emit(br'\x%02x' % ord(s[j:j+1])) + + i = isize + + return (quote + b''.join(outv) + quote, nonascii_escape) + + +# unquote decodes "-quoted unicode|byte string. +# +# ValueError is raised if there are quoting syntax errors. 
+def unquote(s): # -> bstr + us, tail = unquote_next(s) + if len(tail) != 0: + raise ValueError('non-empty tail after closing "') + return us + +# unquote_next decodes next "-quoted unicode|byte string. +# +# it returns -> (unquoted(s), tail-after-") +# +# ValueError is raised if there are quoting syntax errors. +def unquote_next(s): # -> (bstr, bstr) + us, tail = _unquote_next(b(s)) + return b(us), b(tail) + +def _unquote_next(s): + assert isinstance(s, bytes) + + if len(s) == 0 or s[0:0+1] != b'"': + raise ValueError('no starting "') + + outv = [] + emit= outv.append + + s = s[1:] + while 1: + r, width = _utf8_decode_rune(s) + if width == 0: + raise ValueError('no closing "') + + if r == ord('"'): + s = s[1:] + break + + # regular UTF-8 character + if r != ord('\\'): + emit(s[:width]) + s = s[width:] + continue + + if len(s) < 2: + raise ValueError('unexpected EOL after \\') + + c = s[1:1+1] + + # \ -> ; c = \ " + if c in b'\\"': + emit(c) + s = s[2:] + continue + + # \t \n \r + uc = None + if c == b't': uc = b'\t' + elif c == b'n': uc = b'\n' + elif c == b'r': uc = b'\r' + # accept also \a \b \v \f that Go might produce + # Python also decodes those escapes even though it does not produce them: + # https://github.com/python/cpython/blob/2.7.18-0-g8d21aa21f2c/Objects/stringobject.c#L677-L688 + elif c == b'a': uc = b'\x07' + elif c == b'b': uc = b'\x08' + elif c == b'v': uc = b'\x0b' + elif c == b'f': uc = b'\x0c' + + if uc is not None: + emit(uc) + s = s[2:] + continue + + # \x?? hex + if c == b'x': # XXX also handle octals? 
+ if len(s) < 2+2: + raise ValueError('unexpected EOL after \\x') + + b = codecs.decode(s[2:2+2], 'hex') + emit(b) + s = s[2+2:] + continue + + raise ValueError('invalid escape \\%s' % chr(ord(c[0:0+1]))) + + return b''.join(outv), s + + +_printable_cat0 = frozenset(['L', 'N', 'P', 'S']) # letters, numbers, punctuation, symbols diff --git a/golang/pyx/build.py b/golang/pyx/build.py index 95e0b17..1cb5f3f 100644 --- a/golang/pyx/build.py +++ b/golang/pyx/build.py @@ -265,6 +265,8 @@ def Extension(name, sources, **kw): '_fmt.pxd', 'io.pxd', '_io.pxd', + 'strconv.pxd', + '_strconv.pxd', 'strings.pxd', 'sync.pxd', '_sync.pxd', diff --git a/golang/strconv.pxd b/golang/strconv.pxd new file mode 100644 index 0000000..dd1d2b9 --- /dev/null +++ b/golang/strconv.pxd @@ -0,0 +1,26 @@ +# cython: language_level=2 +# Copyright (C) 2018-2023 Nexedi SA and Contributors. +# Kirill Smelkov +# +# This program is free software: you can Use, Study, Modify and Redistribute +# it under the terms of the GNU General Public License version 3, or (at your +# option) any later version, as published by the Free Software Foundation. +# +# You can also Link and Combine this program with other software covered by +# the terms of any of the Free Software licenses or any of the Open Source +# Initiative approved licenses and Convey the resulting work. Corresponding +# source of such a combination shall include the source code for all other +# software used. +# +# This program is distributed WITHOUT ANY WARRANTY; without even the implied +# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# See COPYING file for full licensing terms. +# See https://www.nexedi.com/licensing for rationale and options. +"""Package strconv provides Go-compatible string conversions. + +See _strconv.pxd for package documentation. 
+""" + +# redirect cimport: golang.strconv -> golang._strconv (see __init__.pxd for rationale) +from golang._strconv cimport * diff --git a/golang/strconv.py b/golang/strconv.py index 0408a0c..fec0ac9 100644 --- a/golang/strconv.py +++ b/golang/strconv.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright (C) 2018-2022 Nexedi SA and Contributors. +# Copyright (C) 2018-2023 Nexedi SA and Contributors. # Kirill Smelkov # # This program is free software: you can Use, Study, Modify and Redistribute @@ -21,162 +21,8 @@ from __future__ import print_function, absolute_import -import unicodedata, codecs -from six.moves import range as xrange - -from golang import b -from golang._golang import _py_utf8_decode_rune as _utf8_decode_rune, _py_rune_error as _rune_error, _xunichr - - -# quote quotes unicode|bytes string into valid "..." bytestring always quoted with ". -def quote(s): # -> bstr - q, _ = _quote(b(s), b'"') - return b(q) - -def _quote(s, quote): # -> (quoted, nonascii_escape) - assert isinstance(s, bytes), type(s) - assert isinstance(quote, bytes), type(quote) - assert len(quote) == 1, repr(quote) - - outv = [] - emit = outv.append - nonascii_escape = False - i = 0 - while i < len(s): - c = s[i:i+1] - # fast path - ASCII only - if ord(c) < 0x80: - if c in (b'\\', quote): - emit(b'\\'+c) - - # printable ASCII - elif b' ' <= c <= b'\x7e': - emit(c) - - # non-printable ASCII - elif c == b'\t': - emit(br'\t') - elif c == b'\n': - emit(br'\n') - elif c == b'\r': - emit(br'\r') - - # everything else is non-printable - else: - emit(br'\x%02x' % ord(c)) - - i += 1 - - # slow path - full UTF-8 decoding + unicodedata - else: - r, size = _utf8_decode_rune(s[i:]) - isize = i + size - - # decode error - just emit raw byte as escaped - if r == _rune_error and size == 1: - nonascii_escape = True - emit(br'\x%02x' % ord(c)) - - # printable utf-8 characters go as is - elif unicodedata.category(_xunichr(r))[0] in _printable_cat0: - emit(s[i:isize]) - - # everything else goes in 
numeric byte escapes - else: - nonascii_escape = True - for j in xrange(i, isize): - emit(br'\x%02x' % ord(s[j:j+1])) - - i = isize - - return (quote + b''.join(outv) + quote, nonascii_escape) - - -# unquote decodes "-quoted unicode|byte string. -# -# ValueError is raised if there are quoting syntax errors. -def unquote(s): # -> bstr - us, tail = unquote_next(s) - if len(tail) != 0: - raise ValueError('non-empty tail after closing "') - return us - -# unquote_next decodes next "-quoted unicode|byte string. -# -# it returns -> (unquoted(s), tail-after-") -# -# ValueError is raised if there are quoting syntax errors. -def unquote_next(s): # -> (bstr, bstr) - us, tail = _unquote_next(b(s)) - return b(us), b(tail) - -def _unquote_next(s): - assert isinstance(s, bytes) - - if len(s) == 0 or s[0:0+1] != b'"': - raise ValueError('no starting "') - - outv = [] - emit= outv.append - - s = s[1:] - while 1: - r, width = _utf8_decode_rune(s) - if width == 0: - raise ValueError('no closing "') - - if r == ord('"'): - s = s[1:] - break - - # regular UTF-8 character - if r != ord('\\'): - emit(s[:width]) - s = s[width:] - continue - - if len(s) < 2: - raise ValueError('unexpected EOL after \\') - - c = s[1:1+1] - - # \ -> ; c = \ " - if c in b'\\"': - emit(c) - s = s[2:] - continue - - # \t \n \r - uc = None - if c == b't': uc = b'\t' - elif c == b'n': uc = b'\n' - elif c == b'r': uc = b'\r' - # accept also \a \b \v \f that Go might produce - # Python also decodes those escapes even though it does not produce them: - # https://github.com/python/cpython/blob/2.7.18-0-g8d21aa21f2c/Objects/stringobject.c#L677-L688 - elif c == b'a': uc = b'\x07' - elif c == b'b': uc = b'\x08' - elif c == b'v': uc = b'\x0b' - elif c == b'f': uc = b'\x0c' - - if uc is not None: - emit(uc) - s = s[2:] - continue - - # \x?? hex - if c == b'x': # XXX also handle octals? 
- if len(s) < 2+2: - raise ValueError('unexpected EOL after \\x') - - b = codecs.decode(s[2:2+2], 'hex') - emit(b) - s = s[2+2:] - continue - - raise ValueError('invalid escape \\%s' % chr(ord(c[0:0+1]))) - - return b''.join(outv), s - - -_printable_cat0 = frozenset(['L', 'N', 'P', 'S']) # letters, numbers, punctuation, symbols +from golang._strconv import \ + quote, \ + _quote, \ + unquote, \ + unquote_next diff --git a/setup.py b/setup.py index bd5148b..f3ef37f 100644 --- a/setup.py +++ b/setup.py @@ -316,6 +316,9 @@ def get_python_libdir(): Ext('golang.os._signal', ['golang/os/_signal.pyx']), + Ext('golang._strconv', + ['golang/_strconv.pyx']), + Ext('golang._strings_test', ['golang/_strings_test.pyx', 'golang/strings_test.cpp']), From 533bd30acf001ffbc3658d6603ef3e0f5b49d3a9 Mon Sep 17 00:00:00 2001 From: Kirill Smelkov Date: Mon, 26 Jun 2023 22:09:40 +0300 Subject: [PATCH 17/29] golang, strconv: Switch them to cimport each other at pyx level MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since 50b8cb7e (strconv: Move functionality related to UTF8 encode/decode into _golang_str) both golang_str and strconv import each other. Before this patch that import was done at py level at runtime from outside to workaround the import cycle. This results in that strconv functionality is not available while golang is only being imported. So far it was not a problem, but when builtin string types will become patched with bstr and ustr, that will become a problem because string repr starts to be used at import time, which for pybstr is implemented via strconv.quote . -> Fix this by switching golang and strconv to cimport each other at pyx level. There, similarly to C, the cycle works just ok out of the box. 
This also automatically helps performance a bit: name old time/op new time/op delta quote[a] 805µs ± 0% 786µs ± 1% -2.40% (p=0.016 n=5+4) quote[\u03b1] 1.21ms ± 0% 1.12ms ± 0% -7.47% (p=0.008 n=5+5) quote[\u65e5] 785µs ± 0% 738µs ± 2% -5.97% (p=0.016 n=5+4) quote[\U0001f64f] 1.04ms ± 0% 0.92ms ± 1% -11.73% (p=0.008 n=5+5) stdquote 1.18µs ± 0% 1.19µs ± 0% +0.54% (p=0.008 n=5+5) unquote[a] 1.26ms ± 0% 1.08ms ± 0% -14.66% (p=0.008 n=5+5) unquote[\u03b1] 911µs ± 1% 797µs ± 0% -12.55% (p=0.008 n=5+5) unquote[\u65e5] 592µs ± 0% 522µs ± 0% -11.81% (p=0.008 n=5+5) unquote[\U0001f64f] 3.46ms ± 0% 3.21ms ± 0% -7.34% (p=0.008 n=5+5) stdunquote 812ns ± 1% 815ns ± 0% ~ (p=0.183 n=5+5) --- golang/__init__.py | 8 -------- golang/_golang.pxd | 9 ++++++++- golang/_golang_str.pyx | 22 +++++++++------------- golang/_strconv.pxd | 5 +++++ golang/_strconv.pyx | 27 ++++++++++++++------------- golang/strconv.py | 7 +++---- 6 files changed, 39 insertions(+), 39 deletions(-) diff --git a/golang/__init__.py b/golang/__init__.py index 49163d0..e773775 100644 --- a/golang/__init__.py +++ b/golang/__init__.py @@ -324,11 +324,3 @@ def _emit_exc_context(exc, emitf, recursef): pyu as u, \ pyustr as ustr, \ pyuchr as uchr - -# import golang.strconv into _golang from here to workaround cyclic golang ↔ strconv dependency -def _(): - from . import _golang - from . import strconv - _golang.pystrconv = strconv -_() -del _ diff --git a/golang/_golang.pxd b/golang/_golang.pxd index 2e24025..664389f 100644 --- a/golang/_golang.pxd +++ b/golang/_golang.pxd @@ -43,7 +43,7 @@ In addition to Cython/nogil API, golang.pyx provides runtime for golang.py: - Python-level channels are represented by pychan + pyselect. - Python-level error is represented by pyerror. - Python-level panic is represented by pypanic. -- Python-level strings are represented by pybstr and pyustr. +- Python-level strings are represented by pybstr/pyustr and pyb/pyu. 
""" @@ -269,4 +269,11 @@ cdef class pyerror(Exception): cdef object from_error (error err) # -> pyerror | None +# strings +cpdef pyb(s) # -> bstr +cpdef pyu(s) # -> ustr cdef __pystr(object obj) + + +cdef (rune, int) _utf8_decode_rune(const byte[::1] s) +cdef unicode _xunichr(rune i) diff --git a/golang/_golang_str.pyx b/golang/_golang_str.pyx index ba127b6..563e0f9 100644 --- a/golang/_golang_str.pyx +++ b/golang/_golang_str.pyx @@ -72,7 +72,7 @@ from cython cimport no_gc from libc.stdio cimport FILE -pystrconv = None # = golang.strconv imported at runtime (see __init__.py) +from golang cimport strconv import string as pystring import types as pytypes import functools as pyfunctools @@ -97,7 +97,7 @@ pybstr = _pybstr # initially point to -> _pybstr/_pyustr pyustr = _pyustr # TODO -> cdef for speed -def pyb(s): # -> bstr +cpdef pyb(s): # -> bstr """b converts object to bstr. - For bstr the same object is returned. @@ -118,7 +118,7 @@ def pyb(s): # -> bstr raise TypeError("b: invalid type %s" % type(s)) return bs -def pyu(s): # -> ustr +cpdef pyu(s): # -> ustr """u converts object to ustr. - For ustr the same object is returned. 
@@ -1068,7 +1068,7 @@ cdef _bpysmartquote_u3b2(s): # -> (unicode(py3)|bytes(py2), nonascii_escape) if (quote in s) and (b'"' not in s): quote = b'"' - x, nonascii_escape = pystrconv._quote(s, quote) # raw bytes + x, nonascii_escape = strconv._quote(s, quote) # raw bytes if PY_MAJOR_VERSION < 3: return x, nonascii_escape else: @@ -1093,7 +1093,7 @@ def pyqq(obj): # py3: str | bytes if not isinstance(obj, (unicode, bytes)): obj = _bstringify(obj) - return pystrconv.quote(obj) + return strconv.pyquote(obj) @@ -1875,16 +1875,12 @@ cdef extern from "Python.h": from six import unichr # py2: unichr py3: chr from six import int2byte as bchr # py2: chr py3: lambda x: bytes((x,)) -_py_rune_error = utf8.RuneError - cdef bint _ucs2_build = (sys.maxunicode == 0xffff) # ucs2 assert _ucs2_build or sys.maxunicode >= 0x0010ffff # or ucs4 # _utf8_decode_rune decodes next UTF8-character from byte string s. # # _utf8_decode_rune(s) -> (r, size) -def _py_utf8_decode_rune(const byte[::1] s): - return _utf8_decode_rune(s) cdef (rune, int) _utf8_decode_rune(const byte[::1] s): if len(s) == 0: return utf8.RuneError, 0 @@ -2029,10 +2025,10 @@ else: # # it works correctly even on ucs2 python builds, where ordinals >= 0x10000 are # represented as 2 unicode points. -if not _ucs2_build: - _xunichr = unichr -else: - def _xunichr(i): +cdef unicode _xunichr(rune i): + if not _ucs2_build: + return unichr(i) + else: if i < 0x10000: return unichr(i) diff --git a/golang/_strconv.pxd b/golang/_strconv.pxd index 5df4aef..69af360 100644 --- a/golang/_strconv.pxd +++ b/golang/_strconv.pxd @@ -19,3 +19,8 @@ # See COPYING file for full licensing terms. # See https://www.nexedi.com/licensing for rationale and options. 
"""Package strconv provides Go-compatible string conversions.""" + +from golang cimport byte + +cpdef pyquote(s) +cdef _quote(s, quote) # -> (quoted, nonascii_escape) diff --git a/golang/_strconv.pyx b/golang/_strconv.pyx index deb2d78..fac6735 100644 --- a/golang/_strconv.pyx +++ b/golang/_strconv.pyx @@ -25,16 +25,17 @@ from __future__ import print_function, absolute_import import unicodedata, codecs from six.moves import range as xrange -from golang import b -from golang._golang import _py_utf8_decode_rune as _utf8_decode_rune, _py_rune_error as _rune_error, _xunichr +from golang cimport pyb +from golang cimport _utf8_decode_rune, _xunichr +from golang.unicode cimport utf8 # quote quotes unicode|bytes string into valid "..." bytestring always quoted with ". -def quote(s): # -> bstr - q, _ = _quote(b(s), b'"') - return b(q) +cpdef pyquote(s): # -> bstr + q, _ = _quote(pyb(s), b'"') + return pyb(q) -def _quote(s, quote): # -> (quoted, nonascii_escape) +cdef _quote(s, quote): # -> (quoted, nonascii_escape) assert isinstance(s, bytes), type(s) assert isinstance(quote, bytes), type(quote) assert len(quote) == 1, repr(quote) @@ -74,7 +75,7 @@ def _quote(s, quote): # -> (quoted, nonascii_escape) isize = i + size # decode error - just emit raw byte as escaped - if r == _rune_error and size == 1: + if r == utf8.RuneError and size == 1: nonascii_escape = True emit(br'\x%02x' % ord(c)) @@ -96,8 +97,8 @@ def _quote(s, quote): # -> (quoted, nonascii_escape) # unquote decodes "-quoted unicode|byte string. # # ValueError is raised if there are quoting syntax errors. -def unquote(s): # -> bstr - us, tail = unquote_next(s) +def pyunquote(s): # -> bstr + us, tail = pyunquote_next(s) if len(tail) != 0: raise ValueError('non-empty tail after closing "') return us @@ -107,11 +108,11 @@ def unquote(s): # -> bstr # it returns -> (unquoted(s), tail-after-") # # ValueError is raised if there are quoting syntax errors. 
-def unquote_next(s): # -> (bstr, bstr) - us, tail = _unquote_next(b(s)) - return b(us), b(tail) +def pyunquote_next(s): # -> (bstr, bstr) + us, tail = _unquote_next(pyb(s)) + return pyb(us), pyb(tail) -def _unquote_next(s): +cdef _unquote_next(s): assert isinstance(s, bytes) if len(s) == 0 or s[0:0+1] != b'"': diff --git a/golang/strconv.py b/golang/strconv.py index fec0ac9..6cbae96 100644 --- a/golang/strconv.py +++ b/golang/strconv.py @@ -22,7 +22,6 @@ from __future__ import print_function, absolute_import from golang._strconv import \ - quote, \ - _quote, \ - unquote, \ - unquote_next + pyquote as quote, \ + pyunquote as unquote, \ + pyunquote_next as unquote_next From ac751a5623b36481624c79a2947fa22089cbecbf Mon Sep 17 00:00:00 2001 From: Kirill Smelkov Date: Mon, 26 Jun 2023 22:43:05 +0300 Subject: [PATCH 18/29] strconv: Optimize quoting lightly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add type annotations and use C-level objects instead of py-ones where it is easy to do. 
We are not all-good yet, but this already brings some noticable speedup: name old time/op new time/op delta quote[a] 786µs ± 1% 10µs ± 0% -98.76% (p=0.016 n=4+5) quote[\u03b1] 1.12ms ± 0% 0.41ms ± 0% -63.37% (p=0.008 n=5+5) quote[\u65e5] 738µs ± 2% 258µs ± 0% -65.07% (p=0.016 n=4+5) quote[\U0001f64f] 920µs ± 1% 78µs ± 0% -91.46% (p=0.016 n=5+4) stdquote 1.19µs ± 0% 1.19µs ± 0% ~ (p=0.794 n=5+5) unquote[a] 1.08ms ± 0% 1.08ms ± 1% ~ (p=0.548 n=5+5) unquote[\u03b1] 797µs ± 0% 807µs ± 1% +1.23% (p=0.008 n=5+5) unquote[\u65e5] 522µs ± 0% 520µs ± 1% ~ (p=0.056 n=5+5) unquote[\U0001f64f] 3.21ms ± 0% 3.14ms ± 0% -2.13% (p=0.008 n=5+5) stdunquote 815ns ± 0% 836ns ± 0% +2.63% (p=0.008 n=5+5) --- golang/_golang_str.pyx | 16 +++--- golang/_strconv.pxd | 2 +- golang/_strconv.pyx | 115 ++++++++++++++++++++++++++++++----------- 3 files changed, 91 insertions(+), 42 deletions(-) diff --git a/golang/_golang_str.pyx b/golang/_golang_str.pyx index 563e0f9..e6fc8f8 100644 --- a/golang/_golang_str.pyx +++ b/golang/_golang_str.pyx @@ -1056,19 +1056,15 @@ _bstrustr_remove_unsupported_slots() # # NOTE the return type is str type of current python, so that quoted result # could be directly used in __repr__ or __str__ implementation. 
-cdef _bpysmartquote_u3b2(s): # -> (unicode(py3)|bytes(py2), nonascii_escape) - # TODO change to `const byte[::1] s` after strconv._quote is moved to pyx - if isinstance(s, bytearray): - s = _bytearray_data(s) - assert isinstance(s, bytes), s - +cdef _bpysmartquote_u3b2(const byte[::1] s): # -> (unicode(py3)|bytes(py2), nonascii_escape) # smartquotes: choose ' or " as quoting character exactly the same way python does # https://github.com/python/cpython/blob/v2.7.18-0-g8d21aa21f2c/Objects/stringobject.c#L905-L909 - quote = b"'" - if (quote in s) and (b'"' not in s): - quote = b'"' + cdef byte quote = ord("'") + if (quote in s) and (ord('"') not in s): + quote = ord('"') - x, nonascii_escape = strconv._quote(s, quote) # raw bytes + cdef bint nonascii_escape + x = strconv._quote(s, quote, &nonascii_escape) # raw bytes if PY_MAJOR_VERSION < 3: return x, nonascii_escape else: diff --git a/golang/_strconv.pxd b/golang/_strconv.pxd index 69af360..0107aad 100644 --- a/golang/_strconv.pxd +++ b/golang/_strconv.pxd @@ -23,4 +23,4 @@ from golang cimport byte cpdef pyquote(s) -cdef _quote(s, quote) # -> (quoted, nonascii_escape) +cdef bytes _quote(const byte[::1] s, char quote, bint* out_nonascii_escape) # -> (quoted, nonascii_escape) diff --git a/golang/_strconv.pyx b/golang/_strconv.pyx index fac6735..8ffd6f5 100644 --- a/golang/_strconv.pyx +++ b/golang/_strconv.pyx @@ -23,49 +23,82 @@ from __future__ import print_function, absolute_import import unicodedata, codecs -from six.moves import range as xrange -from golang cimport pyb +from golang cimport pyb, byte, rune from golang cimport _utf8_decode_rune, _xunichr from golang.unicode cimport utf8 +from cpython cimport PyObject + +cdef extern from "Python.h": + PyObject* PyBytes_FromStringAndSize(char*, Py_ssize_t) except NULL + char* PyBytes_AS_STRING(PyObject*) + int _PyBytes_Resize(PyObject**, Py_ssize_t) except -1 + void Py_DECREF(PyObject*) + # quote quotes unicode|bytes string into valid "..." 
bytestring always quoted with ". cpdef pyquote(s): # -> bstr - q, _ = _quote(pyb(s), b'"') + cdef bint _ + q = _quote(pyb(s), '"', &_) return pyb(q) -cdef _quote(s, quote): # -> (quoted, nonascii_escape) - assert isinstance(s, bytes), type(s) - assert isinstance(quote, bytes), type(quote) - assert len(quote) == 1, repr(quote) - outv = [] - emit = outv.append - nonascii_escape = False - i = 0 +cdef char[16] hexdigit # = '0123456789abcdef' +for i, c in enumerate('0123456789abcdef'): + hexdigit[i] = ord(c) + + +# XXX not possible to use `except (NULL, False)` +# (https://stackoverflow.com/a/66335433/9456786) +cdef bytes _quote(const byte[::1] s, char quote, bint* out_nonascii_escape): # -> (quoted, nonascii_escape) + # 2*" + max(4)*each byte (+ 1 for tail \0 implicitly by PyBytesObject) + cdef Py_ssize_t qmaxsize = 1 + 4*len(s) + 1 + cdef PyObject* qout = PyBytes_FromStringAndSize(NULL, qmaxsize) + cdef byte* q = PyBytes_AS_STRING(qout) + + cdef bint nonascii_escape = False + cdef Py_ssize_t i = 0, j + cdef Py_ssize_t isize + cdef int size + cdef rune r + cdef byte c + q[0] = quote; q += 1 while i < len(s): - c = s[i:i+1] + c = s[i] # fast path - ASCII only - if ord(c) < 0x80: - if c in (b'\\', quote): - emit(b'\\'+c) + if c < 0x80: + if c in (ord('\\'), quote): + q[0] = ord('\\') + q[1] = c + q += 2 # printable ASCII - elif b' ' <= c <= b'\x7e': - emit(c) + elif 0x20 <= c <= 0x7e: + q[0] = c + q += 1 # non-printable ASCII - elif c == b'\t': - emit(br'\t') - elif c == b'\n': - emit(br'\n') - elif c == b'\r': - emit(br'\r') + elif c == ord('\t'): + q[0] = ord('\\') + q[1] = ord('t') + q += 2 + elif c == ord('\n'): + q[0] = ord('\\') + q[1] = ord('n') + q += 2 + elif c == ord('\r'): + q[0] = ord('\\') + q[1] = ord('r') + q += 2 # everything else is non-printable else: - emit(br'\x%02x' % ord(c)) + q[0] = ord('\\') + q[1] = ord('x') + q[2] = hexdigit[c >> 4] + q[3] = hexdigit[c & 0xf] + q += 4 i += 1 @@ -77,21 +110,41 @@ cdef _quote(s, quote): # -> (quoted, 
nonascii_escape) # decode error - just emit raw byte as escaped if r == utf8.RuneError and size == 1: nonascii_escape = True - emit(br'\x%02x' % ord(c)) + q[0] = ord('\\') + q[1] = ord('x') + q[2] = hexdigit[c >> 4] + q[3] = hexdigit[c & 0xf] + q += 4 # printable utf-8 characters go as is - elif unicodedata.category(_xunichr(r))[0] in _printable_cat0: - emit(s[i:isize]) + elif _unicodedata_category(_xunichr(r))[0] in 'LNPS': # letters, numbers, punctuation, symbols + for j in range(i, isize): + q[0] = s[j] + q += 1 # everything else goes in numeric byte escapes else: nonascii_escape = True - for j in xrange(i, isize): - emit(br'\x%02x' % ord(s[j:j+1])) + for j in range(i, isize): + c = s[j] + q[0] = ord('\\') + q[1] = ord('x') + q[2] = hexdigit[c >> 4] + q[3] = hexdigit[c & 0xf] + q += 4 i = isize - return (quote + b''.join(outv) + quote, nonascii_escape) + q[0] = quote; q += 1 + q[0] = 0; # don't q++ at last because size does not include tail \0 + cdef Py_ssize_t qsize = (q - PyBytes_AS_STRING(qout)) + assert qsize <= qmaxsize + _PyBytes_Resize(&qout, qsize) + + bqout = qout + Py_DECREF(qout) + out_nonascii_escape[0] = nonascii_escape + return bqout # unquote decodes "-quoted unicode|byte string. 
@@ -181,4 +234,4 @@ cdef _unquote_next(s): return b''.join(outv), s -_printable_cat0 = frozenset(['L', 'N', 'P', 'S']) # letters, numbers, punctuation, symbols +cdef _unicodedata_category = unicodedata.category From 17dbfbac88a1adba8da0c733e24b2a8317139468 Mon Sep 17 00:00:00 2001 From: Kirill Smelkov Date: Thu, 5 Oct 2023 11:22:19 +0300 Subject: [PATCH 19/29] X My draft state of x/gpystr work; py2/py3 pickle problem should be essentially solved --- .gitmodules | 6 + 3rdparty/capstone | 1 + 3rdparty/funchook | 1 + MANIFEST.in | 10 +- README.rst | 17 +- conftest.py | 10 + golang/_golang.pyx | 4 +- golang/_golang_str.pyx | 550 +++++++++-- golang/_golang_str_pickle.S | 371 ++++++++ golang/_golang_str_pickle.pyx | 1325 +++++++++++++++++++++++++++ golang/_golang_str_pickle_test.pyx | 181 ++++ golang/_strconv.pyx | 11 +- golang/fmt.h | 2 +- golang/golang_str_pickle_test.py | 512 +++++++++++ golang/golang_str_test.py | 379 +++++--- golang/libgolang.h | 11 +- golang/os.cpp | 4 +- golang/os.h | 2 +- golang/os/signal.cpp | 6 +- golang/pyx/build.py | 55 +- golang/runtime.cpp | 69 ++ golang/runtime.h | 50 + golang/runtime/internal/atomic.cpp | 4 +- golang/runtime/internal/syscall.cpp | 14 +- golang/runtime/internal/syscall.h | 4 +- golang/runtime/libgolang.cpp | 2 +- golang/runtime/platform.h | 65 ++ gpython/.gitignore | 1 + gpython/__init__.py | 56 +- gpython/_gpython.pyx | 31 + gpython/_gpython_c.cpp | 76 ++ gpython/gpython_test.py | 76 +- pyproject.toml | 2 +- setup.py | 254 ++++- 34 files changed, 3858 insertions(+), 304 deletions(-) create mode 100644 .gitmodules create mode 160000 3rdparty/capstone create mode 160000 3rdparty/funchook create mode 100644 conftest.py create mode 100644 golang/_golang_str_pickle.S create mode 100644 golang/_golang_str_pickle.pyx create mode 100644 golang/_golang_str_pickle_test.pyx create mode 100644 golang/golang_str_pickle_test.py create mode 100644 golang/runtime.cpp create mode 100644 golang/runtime.h create mode 100644 
golang/runtime/platform.h create mode 100644 gpython/.gitignore create mode 100644 gpython/_gpython.pyx create mode 100644 gpython/_gpython_c.cpp diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..c279e31 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,6 @@ +[submodule "3rdparty/funchook"] + path = 3rdparty/funchook + url = https://github.com/kubo/funchook.git +[submodule "3rdparty/capstone"] + path = 3rdparty/capstone + url = https://github.com/capstone-engine/capstone.git diff --git a/3rdparty/capstone b/3rdparty/capstone new file mode 160000 index 0000000..097c04d --- /dev/null +++ b/3rdparty/capstone @@ -0,0 +1 @@ +Subproject commit 097c04d9413c59a58b00d4d1c8d5dc0ac158ffaa diff --git a/3rdparty/funchook b/3rdparty/funchook new file mode 160000 index 0000000..88388db --- /dev/null +++ b/3rdparty/funchook @@ -0,0 +1 @@ +Subproject commit 88388db3c69e16c1560fee65c6857d75f5ce6fd5 diff --git a/MANIFEST.in b/MANIFEST.in index d5bebb2..e2cae70 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -2,6 +2,9 @@ include COPYING README.rst CHANGELOG.rst tox.ini pyproject.toml trun .nxdtest include golang/libgolang.h include golang/runtime/libgolang.cpp include golang/runtime/libpyxruntime.cpp +include golang/runtime/platform.h +include golang/runtime.h +include golang/runtime.cpp include golang/pyx/runtime.h include golang/pyx/testprog/golang_dso_user/dsouser/dso.h include golang/pyx/testprog/golang_dso_user/dsouser/dso.cpp @@ -36,7 +39,10 @@ include golang/time.cpp include golang/_testing.h include golang/_compat/windows/strings.h include golang/_compat/windows/unistd.h +include gpython/_gpython_c.cpp recursive-include golang *.py *.pxd *.pyx *.toml *.txt* -recursive-include gpython *.py -recursive-include 3rdparty *.h +recursive-include gpython *.py *.pyx +recursive-include 3rdparty *.h *.c *.cpp *.S *.py *.cmake *.cs *.java +recursive-include 3rdparty LICENSE README.md README COPYING Makefile CMakeLists.txt recursive-exclude golang *_dsoinfo.py +include 
conftest.py diff --git a/README.rst b/README.rst index 06ec430..80846e7 100644 --- a/README.rst +++ b/README.rst @@ -4,7 +4,7 @@ Package `golang` provides Go-like features for Python: -- `gpython` is Python interpreter with support for lightweight threads. +- `gpython` is Python interpreter with support for lightweight threads and uniform UTF8-based approach to strings. - `go` spawns lightweight thread. - `chan` and `select` provide channels with Go semantic. - `func` allows to define methods separate from class. @@ -46,15 +46,16 @@ __ http://libuv.org/ __ http://software.schmorp.de/pkg/libev.html -Additionally GPython sets UTF-8 to be default encoding always, and puts `go`, -`chan`, `select` etc into builtin namespace. +Additionally GPython sets UTF-8 to be default encoding always, puts `go`, +`chan`, `select` etc into builtin namespace, and makes `bstr`/`ustr` to be used +instead of builtin string types. .. note:: GPython is optional and the rest of Pygolang can be used from under standard Python too. However without gevent integration `go` spawns full - not lightweight - OS thread. - GPython can be also used with threads - not gevent - runtime. Please see - `GPython options`_ for details. + GPython can be also used with threads - not gevent - runtime and with builtin string types. + Please see `GPython options`_ for details. Goroutines and channels @@ -571,3 +572,9 @@ GPython-specific options and environment variables are listed below: coroutines, while with `threads` `go` spawns full OS thread. `gevent` is default. The runtime to use can be also specified via `$GPYTHON_RUNTIME` environment variable. + +`-X gpython.strings=(bstr+ustr|pystd)` + Specify which string types GPython should use. `bstr+ustr` provide + uniform UTF8-based approach to strings, while `pystd` selects regular + `str` and `unicode`. `bstr+ustr` is default. String types to use can be + also specified via `$GPYTHON_STRINGS` environment variable. 
diff --git a/conftest.py b/conftest.py new file mode 100644 index 0000000..1ca5c1b --- /dev/null +++ b/conftest.py @@ -0,0 +1,10 @@ +# ignore tests in distorm - else it breaks as e.g. +# +# 3rdparty/funchook/distorm/python/test_distorm3.py:15: in +# import distorm3 +# 3rdparty/funchook/distorm/python/distorm3/__init__.py:57: in +# _distorm = _load_distorm() +# 3rdparty/funchook/distorm/python/distorm3/__init__.py:55: in _load_distorm +# raise ImportError("Error loading the diStorm dynamic library (or cannot load library into process).") +# E ImportError: Error loading the diStorm dynamic library (or cannot load library into process). +collect_ignore = ["3rdparty"] diff --git a/golang/_golang.pyx b/golang/_golang.pyx index 24f7f23..b857197 100644 --- a/golang/_golang.pyx +++ b/golang/_golang.pyx @@ -3,7 +3,7 @@ # cython: binding=False # cython: c_string_type=str, c_string_encoding=utf8 # distutils: language = c++ -# distutils: depends = libgolang.h os/signal.h unicode/utf8.h _golang_str.pyx +# distutils: depends = libgolang.h os/signal.h unicode/utf8.h _golang_str.pyx _golang_str_pickle.pyx # # Copyright (C) 2018-2023 Nexedi SA and Contributors. # Kirill Smelkov @@ -34,7 +34,7 @@ from __future__ import print_function, absolute_import _init_libgolang() _init_libpyxruntime() -from cpython cimport PyObject, Py_INCREF, Py_DECREF, PY_MAJOR_VERSION +from cpython cimport PyObject, Py_INCREF, Py_DECREF, Py_CLEAR, PY_MAJOR_VERSION ctypedef PyObject *pPyObject # https://github.com/cython/cython/issues/534 cdef extern from "Python.h": ctypedef struct PyTupleObject: diff --git a/golang/_golang_str.pyx b/golang/_golang_str.pyx index e6fc8f8..d7282a0 100644 --- a/golang/_golang_str.pyx +++ b/golang/_golang_str.pyx @@ -22,6 +22,8 @@ It is included from _golang.pyx . 
""" +from libc.stdio cimport fprintf, stderr # XXX kill + from golang.unicode cimport utf8 from cpython cimport PyUnicode_AsUnicode, PyUnicode_GetSize, PyUnicode_FromUnicode @@ -31,11 +33,13 @@ from cpython cimport Py_EQ, Py_NE, Py_LT, Py_GT, Py_LE, Py_GE from cpython.iterobject cimport PySeqIter_New from cpython cimport PyThreadState_GetDict, PyDict_SetItem from cpython cimport PyObject_CheckBuffer +from cpython cimport Py_TPFLAGS_HAVE_GC, Py_TPFLAGS_HEAPTYPE, Py_TPFLAGS_READY, PyType_Ready +from cpython cimport PyBytes_Format, PyUnicode_Format, PyObject_Str cdef extern from "Python.h": PyTypeObject PyBytes_Type ctypedef struct PyBytesObject: - pass + char *ob_sval cdef extern from "Python.h": PyTypeObject PyUnicode_Type @@ -60,13 +64,40 @@ cdef extern from "Python.h": ctypedef struct _XPyTypeObject "PyTypeObject": PyObject* tp_new(PyTypeObject*, PyObject*, PyObject*) except NULL initproc tp_init + + Py_ssize_t tp_vectorcall_offset + Py_ssize_t tp_weaklistoffset + + PyObject *tp_bases + PyObject *tp_mro + PyObject *tp_cache + PyObject *tp_weaklist + PyObject *tp_subclasses + PySequenceMethods *tp_as_sequence + PyMethodDef *tp_methods + PyMemberDef *tp_members ctypedef struct PySequenceMethods: binaryfunc sq_concat binaryfunc sq_inplace_concat object (*sq_slice) (object, Py_ssize_t, Py_ssize_t) # present only on py2 +cdef extern from "Python.h": + ctypedef struct PyVarObject: + Py_ssize_t ob_size + + +cdef extern from "funchook.h" nogil: + ctypedef struct funchook_t + funchook_t* funchook_create() + int funchook_prepare(funchook_t* h, void** target_func, void* hook_func) + int funchook_install(funchook_t* h, int flags) + int funchook_uninstall(funchook_t* h, int flags) + int funchook_destroy(funchook_t*) + const char* funchook_error_message(const funchook_t*) + int funchook_set_debug_file(const char* name) + from cython cimport no_gc @@ -77,10 +108,6 @@ import string as pystring import types as pytypes import functools as pyfunctools import re as pyre -if 
PY_MAJOR_VERSION >= 3: - import copyreg as pycopyreg -else: - import copy_reg as pycopyreg # zbytes/zunicode point to original std bytes/unicode types even if they will be patched. @@ -250,6 +277,8 @@ cdef __pystr(object obj): # -> ~str return pyb(obj) +# XXX -> bchr ? (not good as "character" means "unicode character") +# -> bstr.chr ? def pybbyte(int i): # -> 1-byte bstr """bbyte(i) returns 1-byte bstr with ordinal i.""" return pyb(bytearray([i])) @@ -259,6 +288,22 @@ def pyuchr(int i): # -> 1-character ustr return pyu(unichr(i)) +# XXX due to "cannot `cdef class` with __new__" (https://github.com/cython/cython/issues/799) XXX review text +# _pybstr.__new__ is hand-made in _pybstr_tp_new which invokes ↓ ._pybstr__new__() . +# we keep it out of class instead of cdef @staticmethod due to https://github.com/cython/cython/issues/5337 +# XXX def instead of cdef due to ""Non-trivial keyword arguments and starred arguments not allowed in cdef functions +def _pybstr__new__(cls, object='', encoding=None, errors=None): + # encoding or errors -> object must expose buffer interface + if not (encoding is None and errors is None): + object = _buffer_decode(object, encoding, errors) + + # _bstringify. Note: it handles bstr/ustr / unicode/bytes/bytearray as documented + object = _bstringify(object) + assert isinstance(object, (unicode, bytes)), object + bobj = _pyb(cls, object) + assert bobj is not None + return bobj + @no_gc # note setup.py assist this to compile despite cdef class _pybstr(bytes): # https://github.com/cython/cython/issues/711 """bstr is byte-string. @@ -293,34 +338,26 @@ cdef class _pybstr(bytes): # https://github.com/cython/cython/issues/711 """ # XXX due to "cannot `cdef class` with __new__" (https://github.com/cython/cython/issues/799) - # _pybstr.__new__ is hand-made in _pybstr_tp_new which invokes ↓ .____new__() . 
- @staticmethod - def ____new__(cls, object='', encoding=None, errors=None): - # encoding or errors -> object must expose buffer interface - if not (encoding is None and errors is None): - object = _buffer_decode(object, encoding, errors) + # _pybstr.__new__ is hand-made in _pybstr_tp_new which invokes ↑ _pybstr__new__() . - # _bstringify. Note: it handles bstr/ustr / unicode/bytes/bytearray as documented - object = _bstringify(object) - assert isinstance(object, (unicode, bytes)), object - bobj = _pyb(cls, object) - assert bobj is not None - return bobj - - def __bytes__(self): return self + def __bytes__(self): return pyb(self) # see __str__ def __unicode__(self): return pyu(self) def __str__(self): if PY_MAJOR_VERSION >= 3: return pyu(self) else: - return self + return pyb(self) # self or pybstr if it was subclass def __repr__(self): qself, nonascii_escape = _bpysmartquote_u3b2(self) bs = _inbstringify_get() if bs.inbstringify == 0 or bs.inrepr: + if pybstr is bytes: # don't wrap with b(...) when bstr replaces builtin str + if PY_MAJOR_VERSION >= 3: + qself = 'b' + qself + return qself if nonascii_escape: # so that e.g. b(u'\x80') is represented as qself = 'b' + qself # b(b'\xc2\x80'), not as b('\xc2\x80') return "b(" + qself + ")" @@ -328,18 +365,8 @@ cdef class _pybstr(bytes): # https://github.com/cython/cython/issues/711 # [b('β')] goes as ['β'] when under _bstringify for %s return qself - - # override reduce for protocols < 2. Builtin handler for that goes through - # copyreg._reduce_ex which eventually calls bytes(bstr-instance) to - # retrieve state, which gives bstr, not bytes. Fix state to be bytes ourselves. 
def __reduce_ex__(self, protocol): - if protocol >= 2: - return zbytes.__reduce_ex__(self, protocol) - return ( - pycopyreg._reconstructor, - (self.__class__, self.__class__, _bdata(self)) - ) - + return _bstr__reduce_ex__(self, protocol) def __hash__(self): # hash of the same unicode and UTF-8 encoded bytes is generally different @@ -381,6 +408,7 @@ cdef class _pybstr(bytes): # https://github.com/cython/cython/issues/711 else: return pyb(x) + # XXX temp disabled # __iter__ - yields unicode characters def __iter__(self): # TODO iterate without converting self to u @@ -575,7 +603,7 @@ cdef PyObject* _pybstr_tp_new(PyTypeObject* _cls, PyObject* _argv, PyObject* _kw if _kw != NULL: kw = _kw - cdef object x = _pybstr.____new__(_cls, *argv, **kw) + cdef object x = _pybstr__new__(_cls, *argv, **kw) Py_INCREF(x) return x (<_XPyTypeObject*>_pybstr).tp_new = &_pybstr_tp_new @@ -592,6 +620,18 @@ cdef PyObject* _pybstr_tp_new(PyTypeObject* _cls, PyObject* _argv, PyObject* _kw # and bytes are completely the same. assert sizeof(_pybstr) == sizeof(PyBytesObject) +# XXX text +def _pyustr__new__(cls, object='', encoding=None, errors=None): + # encoding or errors -> object must expose buffer interface + if not (encoding is None and errors is None): + object = _buffer_decode(object, encoding, errors) + + # _bstringify. Note: it handles bstr/ustr / unicode/bytes/bytearray as documented + object = _bstringify(object) + assert isinstance(object, (unicode, bytes)), object + uobj = _pyu(cls, object) + assert uobj is not None + return uobj @no_gc cdef class _pyustr(unicode): @@ -622,27 +662,15 @@ cdef class _pyustr(unicode): """ # XXX due to "cannot `cdef class` with __new__" (https://github.com/cython/cython/issues/799) - # _pyustr.__new__ is hand-made in _pyustr_tp_new which invokes ↓ .____new__() . 
- @staticmethod - def ____new__(cls, object='', encoding=None, errors=None): - # encoding or errors -> object must expose buffer interface - if not (encoding is None and errors is None): - object = _buffer_decode(object, encoding, errors) - - # _bstringify. Note: it handles bstr/ustr / unicode/bytes/bytearray as documented - object = _bstringify(object) - assert isinstance(object, (unicode, bytes)), object - uobj = _pyu(cls, object) - assert uobj is not None - return uobj + # _pyustr.__new__ is hand-made in _pyustr_tp_new which invokes ↑ _pyustr__new__() . def __bytes__(self): return pyb(self) - def __unicode__(self): return self + def __unicode__(self): return pyu(self) # see __str__ def __str__(self): if PY_MAJOR_VERSION >= 3: - return self + return pyu(self) # = self or pyustr if it was subclass else: return pyb(self) @@ -650,6 +678,11 @@ cdef class _pyustr(unicode): qself, nonascii_escape = _upysmartquote_u3b2(self) bs = _inbstringify_get() if bs.inbstringify == 0 or bs.inrepr: + if pyustr is unicode: # don't wrap with u(...) when ustr replaces builtin str/unicode + if not nonascii_escape: # but only if the string is valid utf-8 + if PY_MAJOR_VERSION < 3: + qself = 'u'+qself + return qself if nonascii_escape: qself = 'b'+qself # see bstr.__repr__ return "u(" + qself + ")" @@ -657,18 +690,8 @@ cdef class _pyustr(unicode): # [u('β')] goes as ['β'] when under _bstringify for %s return qself - - # override reduce for protocols < 2. Builtin handler for that goes through - # copyreg._reduce_ex which eventually calls unicode(ustr-instance) to - # retrieve state, which gives ustr, not unicode. Fix state to be unicode ourselves. 
def __reduce_ex__(self, protocol): - if protocol >= 2: - return zunicode.__reduce_ex__(self, protocol) - return ( - pycopyreg._reconstructor, - (self.__class__, self.__class__, _udata(self)) - ) - + return _ustr__reduce_ex__(self, protocol) def __hash__(self): # see _pybstr.__hash__ for why we stick to hash of current str @@ -718,7 +741,7 @@ cdef class _pyustr(unicode): # https://cython.readthedocs.io/en/latest/src/userguide/migrating_to_cy30.html#arithmetic-special-methods # see also https://github.com/cython/cython/issues/4750 if type(a) is not pyustr: - assert type(b) is pyustr + assert type(b) is pyustr, type(b) return b.__radd__(a) return pyu(zunicode.__add__(a, _pyu_coerce(b))) @@ -738,7 +761,7 @@ cdef class _pyustr(unicode): # __mul__, __rmul__ (no need to override __imul__) def __mul__(a, b): if type(a) is not pyustr: - assert type(b) is pyustr + assert type(b) is pyustr, type(b) return b.__rmul__(a) return pyu(zunicode.__mul__(a, b)) def __rmul__(b, a): @@ -939,7 +962,7 @@ cdef PyObject* _pyustr_tp_new(PyTypeObject* _cls, PyObject* _argv, PyObject* _kw if _kw != NULL: kw = _kw - cdef object x = _pyustr.____new__(_cls, *argv, **kw) + cdef object x = _pyustr__new__(_cls, *argv, **kw) Py_INCREF(x) return x (<_XPyTypeObject*>_pyustr).tp_new = &_pyustr_tp_new @@ -963,9 +986,10 @@ cdef class _pyustrIter: # _bdata/_udata retrieve raw data from bytes/unicode. 
def _bdata(obj): # -> bytes assert isinstance(obj, bytes) - _ = obj.__getnewargs__()[0] # (`bytes-data`,) - assert type(_) is bytes - return _ + if type(obj) is not bytes: + obj = obj.__getnewargs__()[0] # (`bytes-data`,) + assert type(obj) is bytes + return obj """ bcopy = bytes(memoryview(obj)) assert type(bcopy) is bytes @@ -973,9 +997,10 @@ def _bdata(obj): # -> bytes """ def _udata(obj): # -> unicode assert isinstance(obj, unicode) - _ = obj.__getnewargs__()[0] # (`unicode-data`,) - assert type(_) is unicode - return _ + if type(obj) is not unicode: + obj = obj.__getnewargs__()[0] # (`unicode-data`,) + assert type(obj) is unicode + return obj """ cdef Py_UNICODE* u = PyUnicode_AsUnicode(obj) cdef Py_ssize_t size = PyUnicode_GetSize(obj) @@ -1027,6 +1052,22 @@ if PY2: # ---- adjust bstr/ustr classes after what cython generated ---- +# for pybstr/pyustr cython generates .tp_dealloc that refer to bytes/unicode types directly. +# override that to refer to zbytes/zunicode to avoid infinite recursion on free. +cdef void _pybstr_tp_dealloc(PyObject *self): (zbytes) .tp_dealloc(self) +cdef void _pyustr_tp_dealloc(PyObject *self): (zunicode) .tp_dealloc(self) +(pybstr).tp_dealloc = &_pybstr_tp_dealloc +(pyustr).tp_dealloc = &_pyustr_tp_dealloc + +# change names of bstr/ustr to be e.g. "golang.bstr" instead of "golang._golang._bstr" XXX adjust after .name=str +# this makes sure that unpickling saved bstr does not load via unpatched origin +# class, and is also generally good for saving pickle size and for reducing _golang exposure. +# XXX -> _golang_str_pickle.pyx ? +(pybstr).tp_name = "golang.bstr" +(pyustr).tp_name = "golang.ustr" +assert pybstr.__module__ == "golang"; assert pybstr.__name__ == "bstr" +assert pyustr.__module__ == "golang"; assert pyustr.__name__ == "ustr" + # remove unsupported bstr/ustr methods. 
do it outside of `cdef class` to # workaround https://github.com/cython/cython/issues/4556 (`if ...` during # `cdef class` is silently handled wrongly) @@ -1039,12 +1080,11 @@ cdef _bstrustr_remove_unsupported_slots(): 'removesuffix', # py3.9 TODO provide fallback implementation ) for slot in vslot: - if not hasattr(unicode, slot): - _patch_slot(pybstr, slot, DEL) - try: + if not hasattr(zunicode, slot): + if hasattr(pybstr, slot): # we might have already removed it on previous call + _patch_slot(pybstr, slot, DEL) + if hasattr(pyustr, slot): # e.g. we do not define ustr.isprintable ourselves _patch_slot(pyustr, slot, DEL) - except KeyError: # e.g. we do not define ustr.isprintable ourselves - pass _bstrustr_remove_unsupported_slots() @@ -1105,7 +1145,7 @@ cdef _bstringify(object obj): # -> unicode|bytes _bstringify_enter() try: - if PY_MAJOR_VERSION >= 3: + if False: # PY_MAJOR_VERSION >= 3: # NOTE this depends on patches to bytes.{__repr__,__str__} below return unicode(obj) @@ -1118,10 +1158,12 @@ cdef _bstringify(object obj): # -> unicode|bytes # # NOTE this depends on patches to bytes.{__repr__,__str__} and # unicode.{__repr__,__str__} below. - if hasattr(obj, '__unicode__'): - return obj.__unicode__() - elif hasattr(obj, '__str__'): - return obj.__str__() + if False: # PY_MAJOR_VERSION < 3 and hasattr(obj, '__unicode__'): + return obj.__unicode__() # XXX needed ? + elif Py_TYPE(obj).tp_str != NULL: + return Py_TYPE(obj).tp_str(obj) + #elif hasattr(obj, '__str__'): + # return obj.__str__() else: return repr(obj) @@ -1422,19 +1464,24 @@ cdef _InBStringify _inbstringify_get(): return ts_inbstringify +# XXX text +cdef _get_slot(PyTypeObject* typ, str name): + typdict = (typ.tp_dict) + return typdict[name] + # _patch_slot installs func_or_descr into typ's __dict__ as name. # -# if func_or_descr is descriptor (has __get__), it is installed as is. +# if func_or_descr is descriptor (has __get__), or asis=True, it is installed as is. 
# otherwise it is wrapped with "unbound method" descriptor. # # if func_or_descr is DEL the slot is removed from typ's __dict__. cdef DEL = object() -cdef _patch_slot(PyTypeObject* typ, str name, object func_or_descr): +cdef _patch_slot(PyTypeObject* typ, str name, object func_or_descr, asis=False): typdict = (typ.tp_dict) #print("\npatching %s.%s with %r" % (typ.tp_name, name, func_or_descr)) #print("old: %r" % typdict.get(name)) - if hasattr(func_or_descr, '__get__') or func_or_descr is DEL: + if hasattr(func_or_descr, '__get__') or func_or_descr is DEL or asis: descr = func_or_descr else: func = func_or_descr @@ -1498,7 +1545,7 @@ cdef object _atidx_re = pyre.compile('.* at index ([0-9]+)$') cdef _bprintf(const byte[::1] fmt, xarg): # -> pybstr cdef bytearray out = bytearray() - cdef tuple argv = None # if xarg is tuple + cdef object argv = None # if xarg is tuple or subclass cdef object argm = None # if xarg is mapping # https://github.com/python/cpython/blob/2.7-0-g8d21aa21f2c/Objects/stringobject.c#L4298-L4300 @@ -1704,7 +1751,11 @@ cdef _bprintf(const byte[::1] fmt, xarg): # -> pybstr #print('--> __mod__ ', repr(fmt1), ' % ', repr(arg)) try: - s = zbytes.__mod__(fmt1, arg) + IF PY2: + # NOTE not zbytes.__mod__ because underlying PyBytes_Format is patched + s = _pbytes_Format(fmt1, arg) + ELSE: + s = zbytes.__mod__(fmt1, arg) except ValueError as e: # adjust position in '... at index ' from fmt1 to fmt if len(e.args) == 1: @@ -1795,6 +1846,50 @@ class _BFormatter(pystring.Formatter): return super(_BFormatter, self).get_field(field_name, args, kwargs) +# XXX place, comments +# str % ... : ceval on py2 and py3 < 3.11 invokes PyString_Format / PyUnicode_Format +# directly upon seeing BINARY_MODULO. This leads to bstr.__mod__ not being called. 
+ctypedef unicode uformatfunc(object, object) +ctypedef bytes bformatfunc(object, object) +cdef uformatfunc* _punicode_Format = PyUnicode_Format +cdef unicode _unicode_xFormat(object s, object args): + return pyustr.__mod__(s, args) + +IF PY2: + cdef bformatfunc* _pbytes_Format = PyBytes_Format + cdef _bytes_xFormat(object s, object args): + return pybstr.__mod__(s, args) + +cdef _patch_capi_str_format(): + cpatch(&_punicode_Format, _unicode_xFormat) + IF PY2: + cpatch(&_pbytes_Format, _bytes_xFormat) + + +# XXX place, comments, test +#py3.11: specializes instructions. e.g. ustr(obj) will specialize (after +# executing 8 times) to directly invoke +# +# PyObject_Str(obj) +# +# which, if obj is e.g. b'123' will return "b'123'" instead of "123". +# +# -> if we patch str=ustr, we need to patch PyObject_Str as well. +# -> XXX and check all other specializations. +# +# NOTE also good to just do +cdef _object_xStr(object s): + IF PY2: + return pybstr(s) + ELSE: + return pyustr(s) +ctypedef object objstrfunc(object) +cdef objstrfunc* _pobject_Str = PyObject_Str +cdef _patch_capi_object_str(): + cpatch(&_pobject_Str, _object_xStr) + + + # ---- misc ---- cdef object _xpyu_coerce(obj): @@ -1871,6 +1966,7 @@ cdef extern from "Python.h": from six import unichr # py2: unichr py3: chr from six import int2byte as bchr # py2: chr py3: lambda x: bytes((x,)) +# XXX turn vvv into compile-time constant cdef bint _ucs2_build = (sys.maxunicode == 0xffff) # ucs2 assert _ucs2_build or sys.maxunicode >= 0x0010ffff # or ucs4 @@ -1910,7 +2006,7 @@ cdef (rune, int) _utf8_decode_rune(const byte[::1] s): # _utf8_decode_surrogateescape mimics s.decode('utf-8', 'surrogateescape') from py3. 
-def _utf8_decode_surrogateescape(const byte[::1] s): # -> unicode +cdef _utf8_decode_surrogateescape(const byte[::1] s): # -> unicode if PY_MAJOR_VERSION >= 3: if len(s) == 0: return u'' # avoid out-of-bounds slice access on &s[0] @@ -1950,7 +2046,7 @@ def _utf8_decode_surrogateescape(const byte[::1] s): # -> unicode # _utf8_encode_surrogateescape mimics s.encode('utf-8', 'surrogateescape') from py3. -def _utf8_encode_surrogateescape(s): # -> bytes +cdef _utf8_encode_surrogateescape(s): # -> bytes assert isinstance(s, unicode) if PY_MAJOR_VERSION >= 3: return zunicode.encode(s, 'UTF-8', 'surrogateescape') @@ -2032,3 +2128,289 @@ cdef unicode _xunichr(rune i): uh = i - 0x10000 return unichr(0xd800 + (uh >> 10)) + \ unichr(0xdc00 + (uh & 0x3ff)) + + +# ---- funchook wrappers ----- + +cdef funchook_t* xfunchook_create() except NULL: + h = funchook_create() + if h == NULL: + raise MemoryError() + return h + +cdef xfunchook_destroy(funchook_t* h): + err = funchook_destroy(h) + if err != 0: + raise RuntimeError(funchook_error_message(h)) + +cdef xfunchook_prepare(funchook_t* h, void** target_func, void* hook_func): + err = funchook_prepare(h, target_func, hook_func) + if err != 0: + raise RuntimeError(funchook_error_message(h)) + +cdef xfunchook_install(funchook_t* h, int flags): + err = funchook_install(h, flags) + if err != 0: + raise RuntimeError(funchook_error_message(h)) + +cdef xfunchook_uninstall(funchook_t* h, int flags): + err = funchook_uninstall(h, flags) + if err != 0: + raise RuntimeError(funchook_error_message(h)) + +# cpatch = xfunchook_prepare on _patch_capi_hook +cdef cpatch(void** target_func, void* hook_func): + assert target_func[0] != NULL + xfunchook_prepare(_patch_capi_hook, target_func, hook_func) + + +# ---- patch unicode/str types to be ustr/bstr under gpython ---- +# XXX make sure original _pybstr/_pyustr cannot be used after patching XXX right ? 
+# XXX and make sure golang._golang._pybstr cannot be imported as well (ex pickle) +# XXX ._pyustr.__module__ = 'builtins' after patch - why? + +def _(): + gpy_strings = getattr(sys, '_gpy_strings', None) + if gpy_strings == 'bstr+ustr': + _patch_str() + elif gpy_strings in ('pystd', None): + pass + else: + raise AssertionError("invalid sys._gpy_strings: %r" % (gpy_strings,)) +_() + +# _patch_str is invoked when gpython imports golang and instructs to replace +# builtin str/unicode types with bstr/ustr. +# +# After the patch is applied all existing objects that have e.g. unicode type +# will switch to having ustr type. +cdef PyTypeObject _unicode_orig +cdef PyTypeObject _bytes_orig +cdef funchook_t* _patch_capi_hook +cdef _patch_str(): + global zbytes, _bytes_orig, pybstr + global zunicode, _unicode_orig, pyustr + global _patch_capi_hook + + #print('\n\nPATCH\n\n') + + # XXX explain + bpreserve_slots = upreserve_slots = ("maketrans",) + if PY_MAJOR_VERSION < 3: + bpreserve_slots += ("encode",) # @property'ies + upreserve_slots += ("decode",) + + # patch unicode to be pyustr. This patches + # - unicode (py2) + # - str (py3) + _pytype_clone(unicode, &_unicode_orig, "unicode(pystd)") + Py_INCREF(unicode) # XXX needed? + zunicode = &_unicode_orig + + _pytype_replace_by_child( + unicode, &_unicode_orig, + pyustr, "ustr(origin)", + upreserve_slots) + pyustr = unicode # retarget pyustr -> unicode to where it was copied + # XXX vvv needed so that patched unicode could be saved by py2:cPickle at all + (pyustr).tp_name = ("unicode" if PY_MAJOR_VERSION < 3 else "str") + + # py2: patch str to be pybstr + if PY_MAJOR_VERSION < 3: + _pytype_clone(bytes, &_bytes_orig, "bytes(pystd)") + Py_INCREF(bytes) # XXX needed? 
+ zbytes = &_bytes_orig + + _pytype_replace_by_child( + bytes, &_bytes_orig, + _pybstr, "bstr(origin)", + bpreserve_slots) + pybstr = bytes # retarget pybstr -> bytes to where it was copied + (pybstr).tp_name = ("str" if PY_MAJOR_VERSION < 3 else "bytes") + + # need to remove unsupported slots in cloned bstr/ustr again since PyType_Ready might have recreated them + _bstrustr_remove_unsupported_slots() + + + # also patch UserString to have methods that bstr/ustr have + # else e.g. IPython's guarded_eval.py fails in `_list_methods(collections.UserString, dir(str))` + from six.moves import UserString + def userstr__bytes__(s): return bytes(s.data) + def userstr__unicode__(s): return unicode(s.data) + assert not hasattr(UserString, '__bytes__') # XXX test + assert not hasattr(UserString, '__unicode__') + UserString.__bytes__ = userstr__bytes__ + UserString.__unicode__ = userstr__unicode__ + + # XXX also patch CAPI functions ... XXX explain + #funchook_set_debug_file("/dev/stderr") + _patch_capi_hook = xfunchook_create() + + _patch_capi_str_format() + _patch_capi_object_str() + _patch_capi_unicode_decode_as_bstr() + _patch_str_pickle() + # ... + + xfunchook_install(_patch_capi_hook, 0) + + +# XXX place ok ? +include '_golang_str_pickle.pyx' + +# _pytype_clone clones PyTypeObject src into dst. +# dst must not be previously initialized. +# +# dst will have reference-count = 1 meaning new reference to it is returned. +cdef _pytype_clone(PyTypeObject *src, PyTypeObject *dst, const char* new_name): + assert (src.tp_flags & Py_TPFLAGS_READY) != 0 + assert (src.tp_flags & Py_TPFLAGS_HEAPTYPE) == 0 # src is not allocated on heap + #assert not PyType_IS_GC((src).ob_type) # XXX not true as unicode.ob_type is PyType_Type + # which generally has GC support, but + # GC is deactivated for non-heap types. 
+ # copy the struct XXX + .ob_next / .ob_prev (Py_TRACE_REFS) + dst[0] = src[0] + (dst).ob_refcnt = 1 + + if new_name != NULL: + dst.tp_name = new_name + + # now reinitialize things like .tp_dict etc, where PyType_Ready built slots that point to src. + # we want all those slots to be rebuilt and point to dst instead. + _dst = <_XPyTypeObject*>dst + dst .tp_flags &= ~Py_TPFLAGS_READY + dst .tp_dict = NULL + _dst.tp_bases = NULL + _dst.tp_mro = NULL + _dst.tp_cache = NULL + _dst.tp_weaklist = NULL + + # dst.__subclasses__ will be empty because existing children inherit from src, not from dst. + _dst.tp_subclasses = NULL + + PyType_Ready(dst) + assert (dst.tp_flags & Py_TPFLAGS_READY) != 0 + +# _pytype_replace_by_child replaces typ by its child egg. +# +# All existing objects that have type typ will switch to having type egg' . +# The instance/inheritance diagram for existing objects and types will switch +# as depicted below: +# +# base base +# ↑ ↖ +# typ ------> egg' → typ_clone +# ↗ ↑ ↖ ↗ ↑ ↗ +# objects X egg objects X egg +# ↑ ↑ +# Y Y +# +# typ_clone must be initialized via _pytype_clone(typ, typ_clone). +# egg' is egg clone put inplace of typ +# +# XXX preserve_slots - describe +cdef _pytype_replace_by_child(PyTypeObject *typ, PyTypeObject *typ_clone, + PyTypeObject *egg, const char* egg_old_name, + preserve_slots): + otyp = typ ; oegg = egg + vtyp = typ ; vegg = egg + _typ = <_XPyTypeObject*>typ ; _egg = <_XPyTypeObject*>egg + + assert egg.tp_base == typ + assert _egg.tp_subclasses == NULL + + assert (typ.tp_flags & Py_TPFLAGS_READY) != 0 + assert (egg.tp_flags & Py_TPFLAGS_READY) != 0 + + assert (typ.tp_flags & Py_TPFLAGS_HEAPTYPE) == 0 + assert (egg.tp_flags & Py_TPFLAGS_HEAPTYPE) == 0 # XXX will be not true + # -> ! Py_TPFLAGS_HAVE_GC + # -> ? set Py_TPFLAGS_HEAPTYPE back on typ' ? 
+ + # (generally not required) + assert (typ.tp_flags & Py_TPFLAGS_HAVE_GC) == 0 + assert (egg.tp_flags & Py_TPFLAGS_HAVE_GC) == 0 + # XXX also check PyObject_IS_GC (verifies .tp_is_gc() = n) ? + + + assert vtyp.ob_size == vegg.ob_size + assert typ .tp_basicsize == egg .tp_basicsize + assert typ .tp_itemsize == egg .tp_itemsize + IF PY3: + assert _typ.tp_vectorcall_offset == _egg.tp_vectorcall_offset + assert _typ.tp_weaklistoffset == _egg.tp_weaklistoffset + assert typ .tp_dictoffset == egg .tp_dictoffset + + # since egg will change .tp_base it will also need to reinitialize + # .tp_bases, .tp_mro and friends. Retrieve egg slots to preserve before we + # clear egg.__dict__ . This covers e.g. @staticmethod and @property. + keep_slots = {} # name -> slot + for name in preserve_slots: + keep_slots[name] = _get_slot(egg, name) + + # egg: clear what PyType_Ready will recompute + Py_CLEAR(egg .tp_dict) + Py_CLEAR(_egg.tp_bases) + Py_CLEAR(_egg.tp_mro) + Py_CLEAR(_egg.tp_cache) + + # typ <- egg preserving original typ's refcnt, weak references and subclasses\egg. + # typ will be now playing the role of egg + typ_refcnt = otyp.ob_refcnt + typ_weaklist = _typ.tp_weaklist + typ_subclasses = _typ.tp_subclasses + typ[0] = egg[0] + otyp.ob_refcnt = typ_refcnt + _typ.tp_weaklist = typ_weaklist + _typ.tp_subclasses = typ_subclasses # XXX need to remove egg from here + + # adjust .tp_base + typ.tp_base = typ_clone + egg.tp_base = typ_clone + + # adjust egg.tp_name + if egg_old_name != NULL: + egg.tp_name = egg_old_name + + # reinitialize .tp_bases, .tp_mro. .tp_cache, and recompute slots that + # live in .tp_dict and point to their type. Do it for both typ (new egg) + # and origin egg for generality, even though original egg won't be used + # anymore. 
+ typ.tp_flags &= ~Py_TPFLAGS_READY + egg.tp_flags &= ~Py_TPFLAGS_READY + PyType_Ready(typ) + PyType_Ready(egg) + assert (typ.tp_flags & Py_TPFLAGS_READY) != 0 + assert (egg.tp_flags & Py_TPFLAGS_READY) != 0 + + # restore slots we were asked to preserve as is + # since those slots are e.g. @staticmethods they go to both egg' and egg. + for name, slot in keep_slots.items(): + _patch_slot(typ, name, slot, asis=True) + _patch_slot(egg, name, slot, asis=True) + + # XXX remove egg from typ.tp_subclasses (also possible via setting .__bases__) + # XXX remove typ from base.tp_subclasses + # else e.g. ustr(origin) is reported to be subclass of ustr by help() + # (pyustr.__subclasses__() give it) + + # rebuild .tp_mro of all other typ's children + # initially X.__mro__ = (X, typ, base) and without rebuilding it would + # remain (X, egg', base) instead of correct (X, egg' typ_clone, base) + # XXX py3 does this automatically? XXX -> no, it can invalidate .__mro__, but not .tp_mro + def refresh(x): + assert isinstance(x, type) + xtyp = x + _xtyp = <_XPyTypeObject*>x + fprintf(stderr, 'refreshing %s\n', xtyp.tp_name) + assert (xtyp.tp_flags & Py_TPFLAGS_READY) != 0 + xtyp.tp_flags &= ~Py_TPFLAGS_READY + Py_CLEAR(_xtyp.tp_mro) + PyType_Ready(x) + assert (xtyp.tp_flags & Py_TPFLAGS_READY) != 0 + for _ in x.__subclasses__(): + refresh(_) + for _ in (typ).__subclasses__(): + refresh(_) + + # XXX also preserve ._ob_next + ._ob_prev (present in Py_TRACE_REFS builds) diff --git a/golang/_golang_str_pickle.S b/golang/_golang_str_pickle.S new file mode 100644 index 0000000..3b954bc --- /dev/null +++ b/golang/_golang_str_pickle.S @@ -0,0 +1,371 @@ +// Copyright (C) 2023 Nexedi SA and Contributors. +// Kirill Smelkov +// +// This program is free software: you can Use, Study, Modify and Redistribute +// it under the terms of the GNU General Public License version 3, or (at your +// option) any later version, as published by the Free Software Foundation. 
+// +// You can also Link and Combine this program with other software covered by +// the terms of any of the Free Software licenses or any of the Open Source +// Initiative approved licenses and Convey the resulting work. Corresponding +// source of such a combination shall include the source code for all other +// software used. +// +// This program is distributed WITHOUT ANY WARRANTY; without even the implied +// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +// +// See COPYING file for full licensing terms. +// See https://www.nexedi.com/licensing for rationale and options. + +// _golang_str_pickle.S complements _golang_str_pickle.pyx with assembly routines. + +#include "golang/runtime/platform.h" + + .text + .p2align 4 + +// CSYM returns assembler symbol for C-symbol name +#if defined(LIBGOLANG_OS_darwin) || \ + (defined(LIBGOLANG_OS_windows) && defined(LIBGOLANG_ARCH_386)) +# define CSYM(name) _ ## name +#else +# define CSYM(name) name +#endif + +// _TYPE emits `.type sym, symtype` on systems where .type directive is supported +// _SIZE emits `.size sym, symsize` on systems where .size directive is supported +#ifdef LIBGOLANG_OS_linux +# define _TYPE(sym, symtype) .type sym, symtype +# define _SIZE(sym, symsize) .size sym, symsize +#else +# define _TYPE(sym, type) +# define _SIZE(sym, size) +#endif + +// inside_counted provides trampoline to call *inside_counted_func with +// counting how many times that function entered inside and exited. +// +// Each enter increments inside_counter, while each exit decrements it. +// Recursion is supported up to STK_SIZE times with counter stopping to be +// adjusted at deeper recursion levels. +// +// inside_counted can be used on functions with arbitrary signatures because +// all registers and stack arguments are preserved exactly as is on the call(*). +// +// (*) NOTE₁ on-stack return address / link-register is adjusted during the call. +// this prevents inside_counted to be used with e.g. 
x86.get_pc_thunk.ax . +// NOTE₂ on ARM64 xip0 (x16) is clobbered. +#define inside_counted CSYM(inside_counted) +#define inside_counted_func CSYM(inside_counted_func) +#define inside_counter CSYM(inside_counter) +#define inside_counted_stk CSYM(inside_counted_stk) + .globl inside_counted + _TYPE( inside_counted, @function ) +inside_counted: +#define STK_SIZE 8 + +// ---- X86_64 / i386 ---- + +#if defined(LIBGOLANG_ARCH_amd64) || defined(LIBGOLANG_ARCH_386) +#if defined(LIBGOLANG_ARCH_amd64) +# define REGSIZE 8 +# define rAX rax +# define rPCNT rbx +# define rCNT rcx +# define rPSTK rdx +# define rSP rsp +# ifndef LIBGOLANG_OS_windows + .macro LEAGOT sym, reg + movq \sym@GOTPCREL(%rip), %\reg + .endm +# else + // windows does not use PIC and relocates DLLs when loading them + // there is no GOT and we need to access in-DLL symbols directly + // see e.g. https://stackoverflow.com/q/13309662/9456786 for details. + .macro LEAGOT sym, reg + leaq \sym(%rip), %\reg // NOTE pc-relative addressing used to avoid LNK2017: + .endm // 'ADDR32' relocation ... 
invalid without /LARGEADDRESSAWARE:NO +# endif +#else +# define REGSIZE 4 +# define rAX eax +# define rPCNT ebx +# define rCNT ecx +# define rPSTK edx +# define rSP esp +# ifndef LIBGOLANG_OS_windows + .macro LEAGOT sym, reg + call .Lget_pc_\reg + addl $_GLOBAL_OFFSET_TABLE_, %\reg + movl \sym@GOT(%\reg), %\reg + .endm +# else + // windows does not use PIC - see details in ^^^ amd64 case + .macro LEAGOT sym, reg + leal \sym, %\reg + .endm +# endif +#endif + + sub $REGSIZE, %rSP // make place for jmp-via-ret to *inside_counted_func + + // TODO consider adding cfi_* annotations, but probably it won't be simple + // since we manipulate retaddr on the stack + + push %rAX // save registers we'll use + push %rPCNT + push %rCNT + push %rPSTK +#define SP_JMPVIARET (4*REGSIZE) +#define SP_RETORIG (5*REGSIZE) + + // jmp-via-ret = *inside_counted_func + LEAGOT inside_counted_func, rAX // &inside_counted_func + mov (%rAX), %rAX // inside_counted_func + mov %rAX, SP_JMPVIARET(%rSP) + + // check whether altstk is overflowed + // if it is - invoke the func without counting + LEAGOT inside_counter, rPCNT // &inside_counter + mov (%rPCNT), %rCNT // inside_counter + cmp $STK_SIZE, %rCNT + jge .Lcall + + // altstk is not overflowed + // push original ret to altstk and replace the ret to return to us after the call + LEAGOT inside_counted_stk, rPSTK // &inside_counted_stk + mov SP_RETORIG(%rSP), %rAX // original ret address + mov %rAX, (%rPSTK,%rCNT,REGSIZE) // inside_counted_stk[inside_counter] = retorig + add $1, %rCNT // inside_counter++ + mov %rCNT, (%rPCNT) + +#if defined(LIBGOLANG_ARCH_amd64) + lea .Laftercall(%rip), %rAX +#else + call .Lget_pc_eax + add $(.Laftercall-.), %rAX +#endif + mov %rAX, SP_RETORIG(%rSP) // replace ret addr on stack to .Laftercall + +.Lcall: + // restore registers and invoke the func through jmp-via-ret + pop %rPSTK + pop %rCNT + pop %rPCNT + pop %rAX + ret + +.Laftercall: + // we get here after invoked func returns if altstk was not overflowed + // 
decrement inside_counter and return to original ret address + sub $REGSIZE, %rSP // make place for original ret + push %rAX // save registers + push %rPCNT + push %rCNT + push %rPSTK +#undef SP_RETORIG +#define SP_RETORIG (4*REGSIZE) + + LEAGOT inside_counter, rPCNT // &inside_counter + mov (%rPCNT), %rCNT // inside_counter + sub $1, %rCNT + mov %rCNT, (%rPCNT) // inside_counter-- + LEAGOT inside_counted_stk, rPSTK // &inside_counted_stk + mov (%rPSTK,%rCNT,REGSIZE), %rAX // retorig = inside_counted_stk[inside_counter] + mov %rAX, SP_RETORIG(%rSP) + + // restore registers and return to original caller + pop %rPSTK + pop %rCNT + pop %rPCNT + pop %rAX + ret + +#if defined(LIBGOLANG_ARCH_386) +.macro DEF_get_pc reg + .Lget_pc_\reg: + mov (%esp), %\reg + ret +.endm +DEF_get_pc eax +DEF_get_pc ebx +DEF_get_pc ecx +DEF_get_pc edx +#endif + +// ---- ARM64 ---- + +#elif defined(LIBGOLANG_ARCH_arm64) +#define REGSIZE 8 +#define rPCNT x0 +#define rCNT x1 +#define rPSTK x2 +#define rXIP0 x16 + stp rPCNT, rCNT, [sp, -16]! // save registers we'll use + stp rPSTK, xzr, [sp, -16]! 
// NOTE xip0 is clobbered + + // xip0 = *inside_counted_func + adrp rXIP0, :got:inside_counted_func + ldr rXIP0, [rXIP0, :got_lo12:inside_counted_func] // &inside_counted_func + ldr rXIP0, [rXIP0] // inside_counted_func + + // check whether altstk is overflowed + // if it is - invoke the func without counting + adrp rPCNT, :got:inside_counter + ldr rPCNT, [rPCNT, :got_lo12:inside_counter] // &inside_counter + ldr rCNT, [rPCNT] // inside_counter + cmp rCNT, STK_SIZE + bge .Lcall + + // altstk is not overflowed + // push original ret to altstk and replace the ret to return to us after the call + adrp rPSTK, :got:inside_counted_stk + ldr rPSTK, [rPSTK, :got_lo12:inside_counted_stk] // &inside_counted_stk + str lr, [rPSTK, rCNT, lsl 3] // inside_counted_stk[inside_counter] = retorig + add rCNT, rCNT, 1 // inside_counter++ + str rCNT, [rPCNT] + + adr lr, .Laftercall // replace ret addr to .Laftercall + +.Lcall: + // restore registers and invoke the func via xip0 + ldp rPSTK, xzr, [sp], 16 + ldp rPCNT, rCNT, [sp], 16 + br rXIP0 + +.Laftercall: + // we get here after invoked func returns if altstk was not overflowed + // decrement inside_counter and return to original ret address + stp rPCNT, rCNT, [sp, -16]! // save registers + stp rPSTK, xzr, [sp, -16]! 
+ + adrp rPCNT, :got:inside_counter + ldr rPCNT, [rPCNT, :got_lo12:inside_counter] // &inside_counter + ldr rCNT, [rPCNT] // inside_counter + sub rCNT, rCNT, 1 + str rCNT, [rPCNT] // inside_counter-- + adrp rPSTK, :got:inside_counted_stk + ldr rPSTK, [rPSTK, :got_lo12:inside_counted_stk] // &inside_counted_stk + ldr lr, [rPSTK, rCNT, lsl 3] // lr = inside_counted_stk[inside_counter] + + // restore registers and return to original caller + ldp rPSTK, xzr, [sp], 16 + ldp rPCNT, rCNT, [sp], 16 + ret + +#else +# error "unsupported architecture" +#endif + + _SIZE( inside_counted, .-inside_counted ) + +// ---- data --- + .bss + +// void* inside_counted_func + .globl inside_counted_func + .p2align 3 // 8 + _TYPE( inside_counted_func, @object ) + _SIZE( inside_counted_func, REGSIZE ) +inside_counted_func: + .zero REGSIZE + +// long inside_counter + .globl inside_counter + .p2align 3 // 8 + _TYPE( inside_counter, @object ) + _SIZE( inside_counter, REGSIZE ) +inside_counter: + .zero REGSIZE + +// void* inside_counted_stk[STK_SIZE] + .globl inside_counted_stk + .p2align 5 // 32 + _TYPE( inside_counted_stk, @object ) + _SIZE( inside_counted_stk, STK_SIZE*REGSIZE ) +inside_counted_stk: + .zero STK_SIZE*REGSIZE + + +// disable executable stack +#ifndef LIBGOLANG_OS_windows + .section .note.GNU-stack,"",@progbits +#endif + + +// ---- custom callconv proxies ---- + .text + .p2align 4 + +// saveprobe_ (self, obj, pers_save) input callconv, proxy to saveprobe +// _pickle_Pickler_xsave_(self, obj, pers_save) input callconv, proxy to _pickle_Pickler_xsave +// save_invoke_as_ (save, self, obj, pers_save) input std, proxy to save invoked via callconv + + +#if defined(LIBGOLANG_ARCH_386) + +#ifdef LIBGOLANG_CC_msc +# define CSYM_FASTCALL3(name) @name@12 // MSVC mangles __fastcall +# define CSYM_FASTCALL4(name) @name@16 +#else +# define CSYM_FASTCALL3(name) CSYM(name) +# define CSYM_FASTCALL4(name) CSYM(name) +#endif + +// python-3.11.5.exe has _pickle.save accepting arguments in 
ecx,edx,stack but +// contrary to fastcall the callee does not cleanup the stack. +// Handle this as fastcall_nostkclean + +.macro FUNC_fastcall_nostkclean name + .globl CSYM(\name\()_fastcall_nostkclean) + _TYPE( CSYM(\name\()_fastcall_nostkclean), @function ) +CSYM(\name\()_fastcall_nostkclean): + // we are proxying to fastcall - ecx and edx are already setup and we + // need to only duplicate the 3rd argument on the stack. Do this without + // clobbering any register. + sub $4, %esp // place to copy on-stack argument to + push %eax + mov 12(%esp), %eax // original on-stack arg + mov %eax, 4(%esp) // dup to copy + pop %eax + + call CSYM_FASTCALL3(\name\()_ifastcall) + // ^^^ cleaned up the stack from our copy + // nothing to do anymore + ret + _SIZE( CSYM(\name\()_fastcall_nostkclean), .-CSYM(\name\()_fastcall_nostkclean) ) +.endm +FUNC_fastcall_nostkclean saveprobe +FUNC_fastcall_nostkclean _pickle_Pickler_xsave +FUNC_fastcall_nostkclean _zpickle_Pickler_xsave + +#define save_invoke_as_fastcall_nostkclean CSYM_FASTCALL4(save_invoke_as_fastcall_nostkclean) + .globl save_invoke_as_fastcall_nostkclean + _TYPE( save_invoke_as_fastcall_nostkclean, @function ) +save_invoke_as_fastcall_nostkclean: + // input: + // ecx: save + // edx: self + // stk[1]: obj + // stk[2]: pers_save + // + // invoke save as: + // ecx: self + // edx: obj + // stk*[1]: pers_save + + mov 8(%esp), %eax // pers_save + push %eax // stk*[1] <- per_save + + mov %ecx, %eax // eax <- save + mov %edx, %ecx // ecx <- self + mov (4+4)(%esp), %edx // edx <- obj + + call *%eax + + // return with cleaning up stack + add $4, %esp // pers_save copy we created + ret $8 // original arguments + _SIZE( save_invoke_as_fastcall_nostkclean, .-save_invoke_as_fastcall_nostkclean) + +#endif // 386 diff --git a/golang/_golang_str_pickle.pyx b/golang/_golang_str_pickle.pyx new file mode 100644 index 0000000..ec091c2 --- /dev/null +++ b/golang/_golang_str_pickle.pyx @@ -0,0 +1,1325 @@ +# -*- coding: utf-8 -*- +# 
Copyright (C) 2023 Nexedi SA and Contributors.
+#                          Kirill Smelkov <kirr@nexedi.com>
+#
+# This program is free software: you can Use, Study, Modify and Redistribute
+# it under the terms of the GNU General Public License version 3, or (at your
+# option) any later version, as published by the Free Software Foundation.
+#
+# You can also Link and Combine this program with other software covered by
+# the terms of any of the Free Software licenses or any of the Open Source
+# Initiative approved licenses and Convey the resulting work. Corresponding
+# source of such a combination shall include the source code for all other
+# software used.
+#
+# This program is distributed WITHOUT ANY WARRANTY; without even the implied
+# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See COPYING file for full licensing terms.
+# See https://www.nexedi.com/licensing for rationale and options.
+"""_golang_str_pickle.pyx complements _golang_str.pyx and keeps everything
+related to pickling strings.
+
+It is included from _golang_str.pyx .
+
+The main entry-points are _patch_str_pickle and _patch_capi_unicode_decode_as_bstr.
+""" + +from cpython cimport PyUnicode_Decode +from cpython cimport PyBytes_FromStringAndSize, _PyBytes_Resize + +cdef extern from "Python.h": + char* PyBytes_AS_STRING(PyObject*) + Py_ssize_t PyBytes_GET_SIZE(PyObject*) + +cdef extern from "Python.h": + ctypedef PyObject* (*PyCFunction)(PyObject*, PyObject*) + ctypedef struct PyMethodDef: + const char* ml_name + PyCFunction ml_meth + ctypedef struct PyCFunctionObject: + PyMethodDef *m_ml + PyObject* m_self + PyObject* m_module + +cdef extern from "structmember.h": + ctypedef struct PyMemberDef: + const char* name + int type + Py_ssize_t offset + + enum: + T_INT + +from libc.stdlib cimport malloc, free +from libc.string cimport memcpy, memcmp + +if PY_MAJOR_VERSION >= 3: + import copyreg as pycopyreg +else: + import copy_reg as pycopyreg + +cdef object zbinary # = zodbpickle.binary | None +try: + import zodbpickle +except ImportError: + zbinary = None +else: + zbinary = zodbpickle.binary + + +# support for pickling bstr/ustr as standalone types. +# +# pickling is organized in such a way that +# - what is saved by py2 can be loaded correctly on both py2/py3, and similarly +# - what is saved by py3 can be loaded correctly on both py2/py3 as well. +# +# XXX place +cdef _bstr__reduce_ex__(self, protocol): + # Ideally we want to emit bstr(BYTES), but BYTES is not available for + # protocol < 3. And for protocol < 3 emitting bstr(STRING) is not an + # option because plain py3 raises UnicodeDecodeError on loading arbitrary + # STRING data. However emitting bstr(UNICODE) works universally because + # pickle supports arbitrary unicode - including invalid unicode - out of + # the box and in exactly the same way on both py2 and py3. For the + # reference upstream py3 uses surrogatepass on encode/decode UNICODE data + # to achieve that. 
+ if protocol < 3: + # use UNICODE for data + udata = _udata(pyu(self)) + if protocol < 2: + return (self.__class__, (udata,)) # bstr UNICODE REDUCE + else: + return (pycopyreg.__newobj__, + (self.__class__, udata)) # bstr UNICODE NEWOBJ + else: + # use BYTES for data + bdata = _bdata(self) + if PY_MAJOR_VERSION < 3: + # the only way we can get here on py2 and protocol >= 3 is zodbpickle + # -> similarly to py3 save bdata as BYTES + assert zbinary is not None + bdata = zbinary(bdata) + return ( + pycopyreg.__newobj__, # bstr BYTES NEWOBJ + (self.__class__, bdata)) + +cdef _ustr__reduce_ex__(self, protocol): + # emit ustr(UNICODE). + # TODO later we might want to switch to emitting ustr(BYTES) + # even if we do this, it should be backward compatible + if protocol < 2: + return (self.__class__, (_udata(self),))# ustr UNICODE REDUCE + else: + return (pycopyreg.__newobj__, # ustr UNICODE NEWOBJ + (self.__class__, _udata(self))) + + + +# types used while patching +cdef extern from *: + """ + struct PicklerObject; + """ + struct PicklerObject: + pass + +cdef struct PicklerTypeInfo: + Py_ssize_t size # sizeof(PicklerObject) + Py_ssize_t off_bin # offsetof `int bin` + Py_ssize_t off_poutput_buffer # offsetof `PyObject *output_buffer` + Py_ssize_t off_output_len # offsetof `Py_ssize_t output_len` + Py_ssize_t off_max_output_len # offsetof `Py_ssize_t max_output_len` + + +# XXX place ? +cdef extern from * nogil: + r""" + // CALLCONV instructs compiler to use specified builtin calling convention. + // it should be used like this: + // + // int CALLCONV(stdcall) myfunc(...) + #ifndef LIBGOLANG_CC_msc + # define CALLCONV(callconv) __attribute__((callconv)) + #else // MSC + # define CALLCONV(callconv) __##callconv + #endif + + + // FOR_EACH_CALLCONV invokes macro X(ccname, callconv, cckind) for every supported calling convention. + // cckind is one of `builtin` or `custom`. 
+ #ifdef LIBGOLANG_ARCH_386 + # ifndef LIBGOLANG_CC_msc + # define FOR_EACH_CALLCONV(X) \ + X(default,, builtin) \ + X(cdecl, CALLCONV(cdecl), builtin) \ + X(stdcall, CALLCONV(stdcall), builtin) \ + X(fastcall, CALLCONV(fastcall), builtin) \ + X(thiscall, CALLCONV(thiscall), builtin) \ + X(regparm1, CALLCONV(regparm(1)), builtin) \ + X(regparm2, CALLCONV(regparm(2)), builtin) \ + X(regparm3, CALLCONV(regparm(3)), builtin) \ + X(fastcall_nostkclean, na, custom ) + # else // MSC + # define FOR_EACH_CALLCONV(X) \ + X(default,, builtin) \ + X(cdecl, CALLCONV(cdecl), builtin) \ + X(stdcall, CALLCONV(stdcall), builtin) \ + X(fastcall, CALLCONV(fastcall), builtin) \ + /* X(CALLCONV(thiscall), thiscall) MSVC emits "C3865: '__thiscall': can only be used on native member functions" */ \ + /* in theory we can emulate thiscall via fastcall https://tresp4sser.wordpress.com/2012/10/06/how-to-hook-thiscall-functions/ */ \ + X(vectorcall, CALLCONV(vectorcall), builtin) \ + X(fastcall_nostkclean, na, custom ) + # endif + #elif defined(LIBGOLANG_ARCH_amd64) + # define FOR_EACH_CALLCONV(X) \ + X(default,, builtin) + #elif defined(LIBGOLANG_ARCH_arm64) + # define FOR_EACH_CALLCONV(X) \ + X(default,, builtin) + #else + # error "unsupported architecture" + #endif + + // Callconv denotes calling convention of a function. + enum Callconv { + #define CC_ENUM1(ccname, _, __) \ + CALLCONV_##ccname, + FOR_EACH_CALLCONV(CC_ENUM1) + }; + + const char* callconv_str(Callconv cconv) { + using namespace golang; + switch(cconv) { + #define CC_STR1(ccname, _, __) \ + case CALLCONV_##ccname: \ + return #ccname; + FOR_EACH_CALLCONV(CC_STR1) + default: + panic("bug"); + } + } + + // SaveFunc represents a save function - its address and calling convention. 
+ struct SaveFunc { + void* addr; + Callconv cconv; + }; + """ + enum Callconv: pass + const char* callconv_str(Callconv) + struct SaveFunc: + void* addr + Callconv cconv + +# XXX doc +cdef struct _pickle_PatchCtx: + initproc Unpickler_tp_xinit # func to replace Unpickler.tp_init + initproc Unpickler_tp_init_orig # what was there before + + vector[SaveFunc] Pickler_xsave_ccv # func to replace _Pickler_save (all callconv variants) + SaveFunc Pickler_save_orig # what was there before + + PicklerTypeInfo iPickler # information detected about PicklerObject type + + +# patch contexts for _pickle and _zodbpickle modules +cdef _pickle_PatchCtx _pickle_patchctx +cdef _pickle_PatchCtx _zpickle_patchctx + + +# _patch_str_pickle patches *pickle modules to support bstr/ustr and UTF-8 properly. +# +# STRING opcodes are handled in backward-compatible way: +# +# - *STRING are loaded as bstr +# - bstr is saved as *STRING +# - pickletools decodes *STRING as UTF-8 +cdef _patch_str_pickle(): + try: + import zodbpickle + except ImportError: + zodbpickle = None + + # py3: pickletools.dis raises UnicodeDecodeError on non-ascii STRING and treats *BINSTRING as latin1 + # -> decode as UTF8b instead + if PY_MAJOR_VERSION >= 3: + import pickletools, codecs + _codecs_escape_decode = codecs.escape_decode + def xread_stringnl(f): + data = _codecs_escape_decode(pickletools.read_stringnl(f, decode=False))[0] + return pybstr(data) + def xread_string1(f): + data = pickletools.read_string1(f).encode('latin1') + return pybstr(data) + def xread_string4(f): + data = pickletools.read_string4(f).encode('latin1') + return pybstr(data) + + pickletools.stringnl.reader = xread_stringnl + pickletools.string1.reader = xread_string1 + pickletools.string4.reader = xread_string4 + + if zodbpickle: + from zodbpickle import pickletools_3 as zpickletools + zpickletools.stringnl.reader = xread_stringnl # was same logic as in std pickletools + zpickletools.string1.reader = xread_string1 + zpickletools.string4.reader = 
xread_string4 + + # py3: pickle.load wants to treat *STRING as bytes and decode it as ASCII + # -> adjust to decode to bstr instead + # -> also save bstr via *STRING opcodes so that load/save is identity + import pickle, _pickle + # TODO _pickle not available (pypy) + _pickle_patchctx.Unpickler_tp_xinit = _pickle_Unpickler_xinit + _pickle_patchctx.Pickler_xsave_ccv = _pickle_Pickler_xsave_ccv + _patch_pickle(pickle, _pickle, &_pickle_patchctx) + + if zodbpickle: + from zodbpickle import pickle as zpickle, _pickle as _zpickle + from zodbpickle import slowpickle as zslowPickle, fastpickle as zfastPickle + # TODO _pickle / fastpickle not available (pypy) + for x in 'load', 'loads', 'Unpickler', 'dump', 'dumps', 'Pickler': + assert getattr(_zpickle, x) is getattr(zfastPickle, x) + assert getattr(zpickle, x) is getattr(_zpickle, x) + _patch_pickle(zslowPickle, None, NULL) + _zpickle_patchctx.Unpickler_tp_xinit = _zpickle_Unpickler_xinit + _zpickle_patchctx.Pickler_xsave_ccv = _zpickle_Pickler_xsave_ccv + _patch_pickle(None, zfastPickle, &_zpickle_patchctx) + # propagate changes from fastpickle -> _zpickle -> zpickle + _zpickle.load = zfastPickle.load + _zpickle.loads = zfastPickle.loads + _zpickle.dump = zfastPickle.dump + _zpickle.dumps = zfastPickle.dumps + assert _zpickle.Unpickler is zfastPickle.Unpickler + assert _zpickle.Pickler is zfastPickle.Pickler + zpickle.load = zfastPickle.load + zpickle.loads = zfastPickle.loads + zpickle.dump = zfastPickle.dump + zpickle.dumps = zfastPickle.dumps + assert zpickle.Unpickler is zfastPickle.Unpickler + assert zpickle.Pickler is zfastPickle.Pickler + +# _patch_pickle serves _patch_str_pickle by patching pair of py-by-default and +# C implementations of a pickle module. +# +# pickle or _pickle being None indicates that corresponding module version is not available. 
+cdef _patch_pickle(pickle, _pickle, _pickle_PatchCtx* _pctx): + # if C module is available - it should shadow default py implementation + if _pickle is not None and pickle is not None: + assert pickle.load is _pickle.load + assert pickle.loads is _pickle.loads + assert pickle.Unpickler is _pickle.Unpickler + assert pickle.dump is _pickle.dump + assert pickle.dumps is _pickle.dumps + assert pickle.Pickler is _pickle.Pickler + + # patch C + if _pickle is not None: + _patch_cpickle(_pickle, _pctx) + # propagate C updates to py + if pickle is not None: + pickle.load = _pickle.load + pickle.loads = _pickle.loads + pickle.Unpickler = _pickle.Unpickler + pickle.dump = _pickle.dump + pickle.dumps = _pickle.dumps # XXX needed? + pickle.Pickler = _pickle.Pickler + + # patch py + if pickle is not None: + _patch_pypickle(pickle, shadowed = (_pickle is not None)) + +# _patch_pypickle serves _patch_pickle for py version. +cdef _patch_pypickle(pickle, shadowed): + def pyattr(name): + if shadowed: + name = '_'+name + return getattr(pickle, name) + + # adjust load / loads / Unpickler to use 'bstr' encoding by default + Unpickler = pyattr('Unpickler') + for f in pyattr('load'), pyattr('loads'), Unpickler.__init__: + f.__kwdefaults__['encoding'] = 'bstr' + + # patch Unpickler._decode_string to handle 'bstr' encoding + # zodbpickle uses .decode_string from first version of patch from bugs.python.org/issue6784 + has__decode = hasattr(Unpickler, '_decode_string') + has_decode = hasattr(Unpickler, 'decode_string') + assert has__decode or has_decode + assert not (has__decode and has_decode) + _decode_string = '_decode_string' if has__decode else 'decode_string' + + Unpickler_decode_string = getattr(Unpickler, _decode_string) + def _xdecode_string(self, value): + if self.encoding == 'bstr': + return pyb(value) + else: + return Unpickler_decode_string(self, value) + setattr(Unpickler, _decode_string, _xdecode_string) + + # adjust Pickler to save bstr as STRING + from struct import pack + 
Pickler = pyattr('Pickler') + def save_bstr(self, obj): + cdef bint nonascii_escape # unused + if self.proto >= 1: + n = len(obj) + if n < 256: + op = b'U' + bytes((n,)) + _bdata(obj) # SHORT_BINSTRING + else: + op = b'T' + pack(' wrap functions + _pickle_load = _pickle.load + _pickle_loads = _pickle.loads + def load (file, *, **kw): + kw.setdefault('encoding', 'bstr') + return _pickle_load (file, **kw) + def loads(data, *, **kw): + kw.setdefault('encoding', 'bstr') + return _pickle_loads(data, **kw) + _pickle.load = load + _pickle.loads = loads + + # adjust Unpickler to use 'bstr' encoding by default + assert isinstance(_pickle.Unpickler, type) + cdef _XPyTypeObject* Unpickler = <_XPyTypeObject*>(_pickle.Unpickler) + + pctx.Unpickler_tp_init_orig = Unpickler.tp_init + Unpickler.tp_init = pctx.Unpickler_tp_xinit + + def Unpickler_x__init__(self, *argv, **kw): + # NOTE don't return - just call: __init__ should return None + pctx.Unpickler_tp_xinit(self, argv, kw) + + _patch_slot(Unpickler, '__init__', Unpickler_x__init__) + # decoding to bstr relies on _patch_capi_unicode_decode_as_bstr + + # adjust Pickler to save bstr as *STRING + # it is a bit involved because: + # - save function, that we need to patch, is not exported. + # - _Pickle_Write, that we need to use from patched save, is not exported neither. + pctx.iPickler = _detect_Pickler_typeinfo(_pickle.Pickler) + pctx.Pickler_save_orig = save = _find_Pickler_save(_pickle.Pickler) + xsave = pctx.Pickler_xsave_ccv[save.cconv] + assert xsave.cconv == save.cconv, (callconv_str(xsave.cconv), callconv_str(save.cconv)) + cpatch(&pctx.Pickler_save_orig.addr, xsave.addr) + + # XXX test at runtime that we hooked save correctly + + +# ---- adjusted C bits for loading ---- + +# adjust Unpickler to use 'bstr' encoding by default and handle that encoding +# in PyUnicode_Decode by returning bstr instead of unicode. This mirrors +# corresponding py loading adjustments. 
+ +cdef int _pickle_Unpickler_xinit(object self, PyObject* args, PyObject* kw) except -1: + xkw = {'encoding': 'bstr'} + if kw != NULL: + xkw.update(kw) + return _pickle_patchctx.Unpickler_tp_init_orig(self, args, xkw) + +cdef int _zpickle_Unpickler_xinit(object self, PyObject* args, PyObject* kw) except -1: + xkw = {'encoding': 'bstr'} + if kw != NULL: + xkw.update(kw) + return _zpickle_patchctx.Unpickler_tp_init_orig(self, args, xkw) + +ctypedef object unicode_decodefunc(const char*, Py_ssize_t, const char* encoding, const char* errors) +cdef unicode_decodefunc* _punicode_Decode +cdef object _unicode_xDecode(const char *s, Py_ssize_t size, const char* encoding, const char* errors): + if encoding != NULL and strcmp(encoding, 'bstr') == 0: + bobj = PyBytes_FromStringAndSize(s, size) # TODO -> PyBSTR_FromStringAndSize directly + return pyb(bobj) + return _punicode_Decode(s, size, encoding, errors) + +cdef _patch_capi_unicode_decode_as_bstr(): + global _punicode_Decode + _punicode_Decode = PyUnicode_Decode + cpatch(&_punicode_Decode, _unicode_xDecode) + + +# ---- adjusted C bits for saving ---- + +# adjust Pickler save to save bstr via *STRING opcodes. +# This mirrors corresponding py saving adjustments, but is more involved to implement. 
+ +cdef int _pickle_Pickler_xsave(PicklerObject* self, PyObject* obj, int pers_save) except -1: + return __Pickler_xsave(&_pickle_patchctx, self, obj, pers_save) + +cdef int _zpickle_Pickler_xsave(PicklerObject* self, PyObject* obj, int pers_save) except -1: + return __Pickler_xsave(&_zpickle_patchctx, self, obj, pers_save) + +# callconv wrappers XXX place +cdef extern from *: + r""" + static int __pyx_f_6golang_7_golang__pickle_Pickler_xsave(PicklerObject*, PyObject*, int); + static int __pyx_f_6golang_7_golang__zpickle_Pickler_xsave(PicklerObject*, PyObject*, int); + + #define DEF_PICKLE_XSAVE_builtin(ccname, callconv) \ + static int callconv \ + _pickle_Pickler_xsave_##ccname(PicklerObject* self, PyObject* obj, int pers_save) { \ + return __pyx_f_6golang_7_golang__pickle_Pickler_xsave(self, obj, pers_save); \ + } + #define DEF_ZPICKLE_XSAVE_builtin(ccname, callconv) \ + static int callconv \ + _zpickle_Pickler_xsave_##ccname(PicklerObject* self, PyObject* obj, int pers_save) { \ + return __pyx_f_6golang_7_golang__zpickle_Pickler_xsave(self, obj, pers_save); \ + } + + #define DEF_PICKLE_XSAVE_custom(ccname, _) \ + extern "C" char _pickle_Pickler_xsave_##ccname; + #define DEF_ZPICKLE_XSAVE_custom(ccname, _) \ + extern "C" char _zpickle_Pickler_xsave_##ccname; + + #define DEF_PICKLE_XSAVE(ccname, callconv, cckind) DEF_PICKLE_XSAVE_##cckind(ccname, callconv) + #define DEF_ZPICKLE_XSAVE(ccname, callconv, cckind) DEF_ZPICKLE_XSAVE_##cckind(ccname, callconv) + + FOR_EACH_CALLCONV(DEF_PICKLE_XSAVE) + FOR_EACH_CALLCONV(DEF_ZPICKLE_XSAVE) + + static std::vector _pickle_Pickler_xsave_ccv = { + #define PICKLE_CC_XSAVE(ccname, _, __) \ + SaveFunc{(void*)&_pickle_Pickler_xsave_##ccname, CALLCONV_##ccname}, + FOR_EACH_CALLCONV(PICKLE_CC_XSAVE) + }; + + static std::vector _zpickle_Pickler_xsave_ccv = { + #define ZPICKLE_CC_XSAVE(ccname, _, __) \ + SaveFunc{(void*)&_zpickle_Pickler_xsave_##ccname, CALLCONV_##ccname}, + FOR_EACH_CALLCONV(ZPICKLE_CC_XSAVE) + }; + + // proxy for 
asm routines to invoke _pickle_Pickler_xsave and _zpickle_Pickler_xsave + #ifdef LIBGOLANG_ARCH_386 + extern "C" int CALLCONV(fastcall) + _pickle_Pickler_xsave_ifastcall(PicklerObject* self, PyObject* obj, int pers_save) { + return __pyx_f_6golang_7_golang__pickle_Pickler_xsave(self, obj, pers_save); + } + extern "C" int CALLCONV(fastcall) + _zpickle_Pickler_xsave_ifastcall(PicklerObject* self, PyObject* obj, int pers_save) { + return __pyx_f_6golang_7_golang__zpickle_Pickler_xsave(self, obj, pers_save); + } + #endif + """ + vector[SaveFunc] _pickle_Pickler_xsave_ccv + vector[SaveFunc] _zpickle_Pickler_xsave_ccv + + +cdef int __Pickler_xsave(_pickle_PatchCtx* pctx, PicklerObject* self, PyObject* obj, int pers_save) except -1: + # !bstr -> use builtin pickle code + if obj.ob_type != pybstr: + return save_invoke(pctx.Pickler_save_orig.addr, pctx.Pickler_save_orig.cconv, + self, obj, pers_save) + + # bstr -> pickle it as *STRING + cdef const char* s + cdef Py_ssize_t l + cdef byte[5] h + cdef Py_ssize_t lh = 1; + cdef bint nonascii_escape + + cdef int bin = (((self) + pctx.iPickler.off_bin))[0] + if bin == 0: + esc = strconv._quote(obj, "'", &nonascii_escape) + assert type(esc) is bytes + s = PyBytes_AS_STRING(esc) + l = PyBytes_GET_SIZE(esc) + __Pickler_xWrite(pctx, self, b'S', 1) # STRING + __Pickler_xWrite(pctx, self, s, l) + __Pickler_xWrite(pctx, self, b'\n', 1) + + else: + s = PyBytes_AS_STRING(obj) + l = PyBytes_GET_SIZE(obj) + if l < 0x100: + h[0] = b'U' # SHORT_BINSTRING + h[1] = l + lh += 1 + elif l < 0x7fffffff: + h[0] = b'T' # BINSTRING + h[1] = (l >> 0) + h[2] = (l >> 8) + h[3] = (l >> 16) + h[4] = (l >> 24) + lh += 4 + else: + raise OverflowError("cannot serialize a string larger than 2 GiB") + + __Pickler_xWrite(pctx, self, h, lh) + __Pickler_xWrite(pctx, self, s, l) + + return 0 + + +# __Pickler_xWrite mimics original _Pickler_Write. 
+# +# we have to implement it ourselves because there is no way to discover +# original _Pickler_Write address: contrary to `save` function _Pickler_Write +# is small and is not recursive. A compiler is thus free to create many +# versions of it with e.g. constant propagation and to inline it freely. The +# latter actually happens for real on LLVM which for py3.11 inlines +# _Pickler_Write fully without leaving any single freestanding instance of it. +# +# XXX explain why we can skip flush in zpickle case +# XXX explain that we do not emit FRAME +cdef int __Pickler_xWrite(_pickle_PatchCtx* pctx, PicklerObject* self, const char* s, Py_ssize_t l) except -1: + ppoutput_buffer = (self + pctx.iPickler.off_poutput_buffer) + poutput_len = (self + pctx.iPickler.off_output_len) + pmax_output_len = (self + pctx.iPickler.off_max_output_len) + + assert ppoutput_buffer[0].ob_type == &PyBytes_Type + assert l >= 0 + assert poutput_len[0] >= 0 + + if l > PY_SSIZE_T_MAX - poutput_len[0]: + raise MemoryError() # overflow + + need = poutput_len[0] + l + if need > pmax_output_len[0]: + if need >= PY_SSIZE_T_MAX // 2: + raise MemoryError() + pmax_output_len[0] = need // 2 * 3 + _PyBytes_Resize(ppoutput_buffer, pmax_output_len[0]) + + buf = PyBytes_AS_STRING(ppoutput_buffer[0]) + memcpy(buf + poutput_len[0], s, l) + poutput_len[0] += l + + return 0 + + +# ---- infrastructure to assist patching C saving codepath ---- + +# _detect_Pickler_typeinfo detects information about PicklerObject type +# through runtime introspection. +# +# This information is used mainly by __Pickler_xWrite. 
+cdef PicklerTypeInfo _detect_Pickler_typeinfo(pyPickler) except *: + cdef PicklerTypeInfo t + + cdef bint debug = False + def trace(*argv): + if debug: + print(*argv) + trace() + + assert isinstance(pyPickler, type) + cdef PyTypeObject* Pickler = pyPickler + cdef _XPyTypeObject* xPickler = <_XPyTypeObject*> pyPickler + + # sizeof + assert Pickler.tp_basicsize > 0 + assert Pickler.tp_itemsize == 0 + t.size = Pickler.tp_basicsize + trace('size:\t', t.size) + + # busy keeps offsets of all bytes for already detected fields + busy = set() + def markbusy(off, size): + for _ in range(off, off+size): + assert _ not in busy, (_, busy) + assert 0 < off <= t.size + busy.add(_) + + # .bin + cdef PyMemberDef* mbin = tp_members_lookup(xPickler.tp_members, 'bin') + assert mbin.type == T_INT, (mbin.type,) + t.off_bin = mbin.offset + markbusy(t.off_bin, sizeof(int)) + trace('.bin:\t', t.off_bin) + + # .output_buffer + # + # 1) new Pickler + # 2) .memo = {} - the only pointer that changes is .memo (PyMemoTable* - not pyobject) + # 3) .tp_clear() - all changed words are changed to 0 and cover non-optional PyObject* and memo + # 4) .__init__() + # 5) go through offsets of all pyobjects and find the one with .ob_type = PyBytes_Type + # -> that is .output_buffer + + # 1) + class Null: + def write(self, data): pass + pyobj = pyPickler(Null()) + cdef PyObject* obj = pyobj + assert obj.ob_type == Pickler + + cdef byte* bobj = obj + cdef byte* bobj2 = malloc(t.size) + # obj_copy copies obj to obj2. + def obj_copy(): + memcpy(bobj2, bobj, t.size) + # obj_diff finds difference in between obj2 and obj. 
+ def obj_diff(Py_ssize_t elemsize): # -> []offset + assert (elemsize & (elemsize - 1)) == 0, elemsize # elemsize is 2^x + cdef Py_ssize_t off + + # skip PyObject_HEAD + off = sizeof(PyObject) + off = (off + elemsize - 1) & (~(elemsize - 1)) + assert off % elemsize == 0 + + # find out offsets of different elements + vdelta = [] + while off + elemsize <= t.size: + if memcmp(bobj + off, bobj2 + off, elemsize): + vdelta.append(off) + off += elemsize + + return vdelta + + # 2) + obj_copy() + pyobj.memo = {} + dmemo = obj_diff(sizeof(void*)) + assert len(dmemo) == 1, dmemo + off_memo = dmemo[0] + markbusy(off_memo, sizeof(void*)) + trace('.memo:\t', off_memo) + + # 3) + assert Pickler.tp_clear != NULL + obj_copy() + Pickler.tp_clear(pyobj) + pointers = obj_diff(sizeof(void*)) + for poff in pointers: + assert ((bobj + poff))[0] == NULL + assert off_memo in pointers + pyobjects = pointers[:] + pyobjects.remove(off_memo) + trace('pyobjects:\t', pyobjects) + + # 4) + pyobj.__init__(Null()) + + # 5) + cdef PyObject* bout = NULL + t.off_poutput_buffer = 0 + for poff in pyobjects: + x = ((bobj + poff))[0] + if x.ob_type == &PyBytes_Type: + if t.off_poutput_buffer == 0: + t.off_poutput_buffer = poff + else: + raise AssertionError("found several inside Pickler") + assert t.off_poutput_buffer != 0 + markbusy(t.off_poutput_buffer, sizeof(PyObject*)) + trace(".output_buffer:\t", t.off_poutput_buffer) + + # .output_len + .max_output_len + # dump something small and expected -> find out which field changes correspondingly + import io + output_len = None + max_output_len = None + for n in range(1,10): + f = io.BytesIO() + pyobj.__init__(f, 0) + o = (None,)*n + pyobj.dump(o) + p = f.getvalue() + phok = b'(' + b'N'*n + b't' # full trails with "p0\n." 
but "p0\n" is optional + assert p.startswith(phok), p + + # InspectWhilePickling observes obj while the pickling is going on: + # - sees which fields have changes + # - sees which fields are candidates for max_output_len + class InspectWhilePickling: + def __init__(self): + self.diff = None # what changes + self.doff2val = {} # off from .diff -> Py_ssize_t read from it + self.max_output_len = set() # offsets that are candidates for .max_output_len + + def __reduce__(self): + self.diff = obj_diff(sizeof(Py_ssize_t)) + for off in self.diff: + self.doff2val[off] = ((bobj + off))[0] + + cdef PyObject* output_buffer = \ + ((bobj + t.off_poutput_buffer))[0] + assert output_buffer.ob_type == &PyBytes_Type + off = sizeof(PyObject) + off = (off + sizeof(Py_ssize_t) - 1) & (~(sizeof(Py_ssize_t) - 1)) + assert off % sizeof(Py_ssize_t) == 0 + while off + sizeof(Py_ssize_t) <= t.size: + v = ((bobj + off))[0] + if v == PyBytes_GET_SIZE(output_buffer): + self.max_output_len.add(off) + off += sizeof(Py_ssize_t) + + return (int, ()) # arbitrary + + pyobj.__init__(Null(), 0) + i = InspectWhilePickling() + o += (i,) + obj_copy() + pyobj.dump(o) + assert i.diff is not None + #trace('n%d diff: %r\toff2val: %r' % (n, i.diff, i.doff2val)) + #trace(' ', busy) + + noutput_len = set() + for off in i.diff: + if off not in busy: + if i.doff2val[off] == (len(phok)-1): # (NNNN without t yet + noutput_len.add(off) + assert len(noutput_len) >= 1, noutput_len + if output_len is None: + output_len = noutput_len + else: + output_len.intersection_update(noutput_len) + + nmax_output_len = set() + for off in i.max_output_len: + if off not in busy: + nmax_output_len.add(off) + assert len(nmax_output_len) >= 1, nmax_output_len + if max_output_len is None: + max_output_len = nmax_output_len + else: + max_output_len.intersection_update(nmax_output_len) + + if len(output_len) != 1: + raise AssertionError("cannot find .output_len") + if len(max_output_len) != 1: + raise AssertionError("cannot find 
.max_output_len") + + t.off_output_len = output_len.pop() + markbusy(t.off_output_len, sizeof(Py_ssize_t)) + trace(".output_len:\t", t.off_output_len) + + t.off_max_output_len = max_output_len.pop() + markbusy(t.off_max_output_len, sizeof(Py_ssize_t)) + trace(".max_output_len:\t", t.off_max_output_len) + + free(bobj2) + return t + + +# _find_Pickler_save determines address and calling convention of `save` C +# function associated with specified Pickler. +# +# Address and calling convention of `save` are needed to be able to patch it. +cdef SaveFunc _find_Pickler_save(pyPickler) except *: + cdef SaveFunc save + save.addr = __find_Pickler_save(pyPickler) + save.cconv = __detect_save_callconv(pyPickler, save.addr) + #fprintf(stderr, "save.addr: %p\n", save.addr) + #fprintf(stderr, "save.cconv: %s\n", callconv_str(save.cconv)) + return save + +cdef void* __find_Pickler_save(pyPickler) except NULL: + assert isinstance(pyPickler, type) + + # start from _pickle_Pickler_dump as root and analyze how called functions + # behave wrt pickling deep chain of objects. We know whether a callee leads + # to save if, upon receiving control in our __reduce__, we see that the + # callee was entered and did not exited yet. If we find such a callee, we + # recourse the process and start to analyze functions that the callee invokes + # itself. We detect reaching save when we see that a callee was entered + # many times recursively. That happens because we feed deep recursive + # structure to the pickle, and because save itself is organized to invoke + # itself recursively - e.g. (obj,) is pickled via save -> save_tuple -> save. 
+ cdef _XPyTypeObject* Pickler = <_XPyTypeObject*>(pyPickler) + cdef PyMethodDef* mdump = tp_methods_lookup(Pickler.tp_methods, 'dump') + #print("%s _pickle_Pickler_dump:" % pyPickler) + addr = mdump.ml_meth # = _pickle_Pickler_dump + while 1: + vcallee = cfunc_direct_callees(addr) + ok = False + for i in range(vcallee.size()): + callee = vcallee[i] + #fprintf(stderr, "checking %p ...\n", callee) + nentry = _nentry_on_deep_save(pyPickler, callee) + #fprintf(stderr, "%p - %ld\n", callee, nentry) + assert nentry in (0, 1) or nentry > 5, nentry + if nentry > 5: + return callee # found save + if nentry == 1: + addr = callee # found path that will lead to save + ok = True + break + + if not ok: + raise AssertionError('cannot find path leading to save') + +# _nentry_on_deep_save tests how addr is related to `save` via inspecting +# addr entry count when Pickler is feed deep recursive structure. +# +# if #entry is 0 - addr is unrelated to save +# if #entry is 1 - addr is related to save and calls it +# if #entry is big - addr is save +cdef long _nentry_on_deep_save(pyPickler, void* addr) except -1: # -> nentry + # below we rely on inside_counted which alters return address during the + # call to wrapped func. In practice this does not create problems on x86_64 + # and arm64, but on i386 there are many calls to functions like + # x86.get_pc_thunk.ax which are used to implement PC-relative addressing. + # If we let inside_counted to hook such a func it will result in a crash + # because returned address will be different from real PC of the caller. + # Try to protect us from entering into such situation by detecting leaf + # functions and not hooking them. For the reference x86.get_pc_thunk.ax is: + # + # movl (%esp), %eax + # ret + vcallee = cfunc_direct_callees(addr) + if vcallee.size() == 0: + return 0 + + # InspectWhilePickling observes how many times currently considered + # function was entered at the point of deep recursion inside save. 
+ class InspectWhilePickling: + def __init__(self): + self.inside_counter = None + def __reduce__(self): + self.inside_counter = inside_counter + return (int, ()) # arbitrary + + class Null: + def write(self, data): pass + + i = InspectWhilePickling() + obj = (i,) + for _ in range(20): + obj = (obj,) + + p = pyPickler(Null(), 0) + + h = xfunchook_create() + global inside_counted_func + inside_counted_func = addr + xfunchook_prepare(h, &inside_counted_func, inside_counted) + xfunchook_install(h, 0) + p.dump(obj) + xfunchook_uninstall(h, 0) + xfunchook_destroy(h) + + assert i.inside_counter is not None + return i.inside_counter + + +# inside_counted is used to patch a function to count how many times that +# function is entered/leaved. +cdef extern from * nogil: # see _golang_str_pickle.S for details + """ + extern "C" { + extern void inside_counted(); + extern void* inside_counted_func; + extern long inside_counter; + } + """ + void inside_counted() + void* inside_counted_func + long inside_counter + + +# __detect_save_callconv determines calling convention that compiler used for save. +# +# On architectures with many registers - e.g. x86_64 and arm64 - the calling +# convention is usually the same as default, but on e.g. i386 - where the +# default cdecl means to put arguments on the stack, the compiler usually +# changes calling convention to use registers instead. 
+cdef Callconv __detect_save_callconv(pyPickler, void* save) except *: + for p in saveprobe_test_ccv: + #print("save: probing %s" % callconv_str(p.cconv)) + good = __save_probe1(pyPickler, save, p.addr) + #print(" ->", good) + if good: + return p.cconv + bad = "cannot determine save calling convention\n\n" + bad += "probed:\n" + for p in saveprobe_test_ccv: + bad += " - %s\t; callee_stkcleanup: %d\n" % (callconv_str(p.cconv), cfunc_is_callee_cleanup(p.addr)) + bad += "\n" + bad += "save callee_stkcleanup: %d\n" % cfunc_is_callee_cleanup(save) + bad += "save disassembly:\n%s" % cfunc_disasm(save) + raise AssertionError(bad) + +cdef bint __save_probe1(pyPickler, void* save, void* cfunc) except *: + # first see whether stack is cleaned up by caller or callee and how much. + # we need to do this first to avoid segfault if we patch save with cfunc + # with different stack cleanup as the probe. + save_stkclean = cfunc_is_callee_cleanup(save) + cfunc_stkclean = cfunc_is_callee_cleanup(cfunc) + if save_stkclean != cfunc_stkclean: + return False + + # now when we know that save and cfunc have the same stack cleanup protocol, we can start probing + global saveprobe_ncall, saveprobe_self, saveprobe_obj, saveprobe_pers_save + saveprobe_ncall = 0 + saveprobe_self = NULL + saveprobe_obj = NULL + saveprobe_pers_save = 0xdeafbeaf + + class Null: + def write(self, data): pass + p = pyPickler(Null(), 0) + obj = object() + + h = xfunchook_create() + xfunchook_prepare(h, &save, cfunc) + xfunchook_install(h, 0) + p.dump(obj) + xfunchook_uninstall(h, 0) + xfunchook_destroy(h) + + assert saveprobe_ncall == 1, saveprobe_ncall + good = (saveprobe_self == p and \ + saveprobe_obj == obj and \ + saveprobe_pers_save == 0) + return good + +cdef extern from * nogil: + r""" + static int saveprobe_ncall; + static void* saveprobe_self; + static void* saveprobe_obj; + static int saveprobe_pers_save; + + static int saveprobe(void* self, PyObject* obj, int pers_save) { + saveprobe_ncall++; + 
saveprobe_self = self; + saveprobe_obj = obj; + saveprobe_pers_save = pers_save; + return 0; // do nothing + } + + #define DEF_SAVEPROBE_builtin(ccname, callconv) \ + static int callconv \ + saveprobe_##ccname(void* self, PyObject* obj, int pers_save) { \ + return saveprobe(self, obj, pers_save); \ + } + #define DEF_SAVEPROBE_custom(ccname, _) \ + extern "C" char saveprobe_##ccname; + #define DEF_SAVEPROBE(ccname, callconv, cckind) DEF_SAVEPROBE_##cckind(ccname, callconv) + FOR_EACH_CALLCONV(DEF_SAVEPROBE) + + static std::vector saveprobe_test_ccv = { + #define CC_SAVEPROBE(ccname, _, __) \ + SaveFunc{(void*)&saveprobe_##ccname, CALLCONV_##ccname}, + FOR_EACH_CALLCONV(CC_SAVEPROBE) + }; + + // proxy for asm routines to invoke saveprobe + #ifdef LIBGOLANG_ARCH_386 + extern "C" int CALLCONV(fastcall) + saveprobe_ifastcall(void* self, PyObject* obj, int pers_save) { \ + return saveprobe(self, obj, pers_save); \ + } + #endif + """ + int saveprobe_ncall + void* saveprobe_self + void* saveprobe_obj + int saveprobe_pers_save + + vector[SaveFunc] saveprobe_test_ccv + + +# XXX doc save_invoke ... 
+# XXX place +cdef extern from *: + r""" + #define CC_SAVE_DEFCALL1_builtin(ccname, callconv) + #define CC_SAVE_DEFCALL1_custom(ccname, _) \ + extern "C" int CALLCONV(fastcall) \ + save_invoke_as_##ccname(void* save, void* self, PyObject* obj, int pers_save); + #define CC_SAVE_DEFCALL1(ccname, callconv, cckind) CC_SAVE_DEFCALL1_##cckind(ccname, callconv) + FOR_EACH_CALLCONV(CC_SAVE_DEFCALL1) + + static int save_invoke(void* save, Callconv cconv, void* self, PyObject* obj, int pers_save) { + using namespace golang; + + switch(cconv) { + #define CC_SAVE_CALL1_builtin(ccname, callconv) \ + case CALLCONV_ ## ccname: \ + return ((int (callconv *)(void*, PyObject*, int))save) \ + (self, obj, pers_save); + #define CC_SAVE_CALL1_custom(ccname, _) \ + case CALLCONV_ ## ccname: \ + return save_invoke_as_##ccname(save, self, obj, pers_save); + #define CC_SAVE_CALL1(ccname, callconv, cckind) CC_SAVE_CALL1_##cckind(ccname, callconv) + FOR_EACH_CALLCONV(CC_SAVE_CALL1) + default: + panic("unreachable"); + } + } + """ + int save_invoke(void* save, Callconv cconv, void* self, PyObject* obj, int pers_save) except -1 + + +# - cfunc_direct_callees returns addresses of functions that cfunc calls directly. +# +# - cfunc_is_callee_cleanup determines whether cfunc does stack cleanup by +# itself and for how much. +# +# - cfunc_disassembly returns disassembly of cfunc. 
+# +# XXX dedup iterating instructions -> DisasmIter +cdef extern from "capstone/capstone.h" nogil: + r""" + #include + #include "golang/fmt.h" + + #if defined(LIBGOLANG_ARCH_amd64) + # define MY_ARCH CS_ARCH_X86 + # define MY_MODE CS_MODE_64 + #elif defined(LIBGOLANG_ARCH_386) + # define MY_ARCH CS_ARCH_X86 + # define MY_MODE CS_MODE_32 + #elif defined(LIBGOLANG_ARCH_arm64) + # define MY_ARCH CS_ARCH_ARM64 + # define MY_MODE CS_MODE_LITTLE_ENDIAN + #else + # error "unsupported architecture" + #endif + + static std::tuple _insn_getimm1(cs_arch arch, cs_insn* ins); + std::vector cfunc_direct_callees(void *cfunc) { + const bool debug = false; + + using namespace golang; + using std::tie; + using std::max; + + std::vector vcallee; + + csh h; + cs_insn* ins; + cs_err err; + + cs_arch arch = MY_ARCH; + err = cs_open(arch, MY_MODE, &h); + if (err) { + fprintf(stderr, "cs_open: %s\n", cs_strerror(err)); + panic(cs_strerror(err)); + } + + err = cs_option(h, CS_OPT_DETAIL, CS_OPT_ON); + if (err) { + fprintf(stderr, "cs_option: %s\n", cs_strerror(err)); + panic(cs_strerror(err)); + } + + ins = cs_malloc(h); + if (ins == nil) + panic("cs_malloc failed"); + + const byte* code = (const byte*)cfunc; + size_t size = 10*1024; // something sane and limited + uint64_t addr = (uint64_t)cfunc; + uint64_t maxjump = addr; + while (cs_disasm_iter(h, &code, &size, &addr, ins)) { + if (debug) + fprintf(stderr, "0x%" PRIx64 ":\t%s\t\t%s\n", ins->address, ins->mnemonic, ins->op_str); + + if (cs_insn_group(h, ins, CS_GRP_RET)) { + if (ins->address >= maxjump) + break; + continue; + } + + uint64_t imm1; + bool imm1ok; + tie(imm1, imm1ok) = _insn_getimm1(arch, ins); + + bool call = cs_insn_group(h, ins, CS_GRP_CALL); + bool jump = cs_insn_group(h, ins, CS_GRP_JUMP) && !call; // e.g. 
BL on arm64 is both jump and call + + if (jump && imm1ok) { + maxjump = max(maxjump, imm1); + continue; + } + + if (call && imm1ok) { + void* callee = (void*)imm1; + if (debug) + fprintf(stderr, " *** DIRECT CALL -> %p\n", callee); + if (!std::count(vcallee.begin(), vcallee.end(), callee)) + vcallee.push_back(callee); + } + } + + if (debug) + fprintf(stderr, "\n"); + + cs_free(ins, 1); + cs_close(&h); + return vcallee; + } + + // _insn_getimm1 checks whether instruction comes with the sole immediate operand and returns it. + static std::tuple _insn_getimm1(cs_arch arch, cs_insn* ins) { + using namespace golang; + using std::make_tuple; + + switch (arch) { + case CS_ARCH_X86: { + cs_x86* x86 = &(ins->detail->x86); + if (x86->op_count == 1) { + cs_x86_op* op = &(x86->operands[0]); + if (op->type == X86_OP_IMM) + return make_tuple(op->imm, true); + } + break; + } + + case CS_ARCH_ARM64: { + cs_arm64* arm64 = &(ins->detail->arm64); + if (arm64->op_count == 1) { + cs_arm64_op* op = &(arm64->operands[0]); + if (op->type == ARM64_OP_IMM) + return make_tuple(op->imm, true); + } + break; + } + + default: + panic("TODO"); + } + + return make_tuple(0, false); + } + + + int cfunc_is_callee_cleanup(void *cfunc) { + // only i386 might have callee-cleanup + // https://en.wikipedia.org/wiki/X86_calling_conventions#List_of_x86_calling_conventions + if (!(MY_ARCH == CS_ARCH_X86 && MY_MODE == CS_MODE_32)) + return 0; + + const bool debug = false; + + int stkclean_by_callee = 0; + using namespace golang; + + csh h; + cs_insn* ins; + cs_err err; + + err = cs_open(MY_ARCH, MY_MODE, &h); + if (err) { + fprintf(stderr, "cs_open: %s\n", cs_strerror(err)); + panic(cs_strerror(err)); + } + + err = cs_option(h, CS_OPT_DETAIL, CS_OPT_ON); + if (err) { + fprintf(stderr, "cs_option: %s\n", cs_strerror(err)); + panic(cs_strerror(err)); + } + + ins = cs_malloc(h); + if (ins == nil) + panic("cs_malloc failed"); + + const byte* code = (const byte*)cfunc; + size_t size = 10*1024; // something sane 
and limited + uint64_t addr = (uint64_t)cfunc; + while (cs_disasm_iter(h, &code, &size, &addr, ins)) { + if (debug) + fprintf(stderr, "0x%" PRIx64 ":\t%s\t\t%s\n", ins->address, ins->mnemonic, ins->op_str); + + if (!cs_insn_group(h, ins, CS_GRP_RET)) + continue; + + assert(ins->id == X86_INS_RET); + cs_x86* x86 = &(ins->detail->x86); + if (x86->op_count > 0) { + cs_x86_op* op = &(x86->operands[0]); + if (op->type == X86_OP_IMM) + stkclean_by_callee = op->imm; + } + + break; + } + + if (debug) + fprintf(stderr, " *** CLEANUP BY: %s (%d)\n", (stkclean_by_callee ? "callee" : "caller"), stkclean_by_callee); + + cs_free(ins, 1); + cs_close(&h); + return stkclean_by_callee; + } + + std::string cfunc_disasm(void *cfunc) { + using namespace golang; + string disasm; + + csh h; + cs_insn* ins; + cs_err err; + + err = cs_open(MY_ARCH, MY_MODE, &h); + if (err) { + fprintf(stderr, "cs_open: %s\n", cs_strerror(err)); + panic(cs_strerror(err)); + } + + err = cs_option(h, CS_OPT_DETAIL, CS_OPT_ON); + if (err) { + fprintf(stderr, "cs_option: %s\n", cs_strerror(err)); + panic(cs_strerror(err)); + } + + ins = cs_malloc(h); + if (ins == nil) + panic("cs_malloc failed"); + + const byte* code = (const byte*)cfunc; + size_t size = 10*1024; // something sane and limited + uint64_t addr = (uint64_t)cfunc; + while (cs_disasm_iter(h, &code, &size, &addr, ins)) { + disasm += fmt::sprintf("0x%" PRIx64 ":\t%s\t\t%s\n", ins->address, ins->mnemonic, ins->op_str); + + // FIXME also handle forward jump like cfunc_direct_callees does + // should be done automatically after DisasmIter dedup + if (cs_insn_group(h, ins, CS_GRP_RET)) + break; + } + + cs_free(ins, 1); + cs_close(&h); + + return disasm; + } + """ + vector[void*] cfunc_direct_callees(void* cfunc) + int cfunc_is_callee_cleanup(void* cfunc) + string cfunc_disasm(void* cfunc) + + +# _test_inside_counted depends on inside_counted and funchook, which we don't want to expose. +# -> include the test from here. 
Do the same for other low-level tests. +include '_golang_str_pickle_test.pyx' + + +# ---- misc ---- + +cdef PyMethodDef* tp_methods_lookup(PyMethodDef* methv, str name) except NULL: + m = &methv[0] + while m.ml_name != NULL: + if str(m.ml_name) == name: + return m + m += 1 + raise KeyError("method %s not found" % name) + +cdef PyMemberDef* tp_members_lookup(PyMemberDef* membv, str name) except NULL: + m = &membv[0] + while m.name != NULL: + if str(m.name) == name: + return m + m += 1 + raise KeyError("member %s not found" % name) diff --git a/golang/_golang_str_pickle_test.pyx b/golang/_golang_str_pickle_test.pyx new file mode 100644 index 0000000..62c9a2f --- /dev/null +++ b/golang/_golang_str_pickle_test.pyx @@ -0,0 +1,181 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2023 Nexedi SA and Contributors. +# Kirill Smelkov +# +# This program is free software: you can Use, Study, Modify and Redistribute +# it under the terms of the GNU General Public License version 3, or (at your +# option) any later version, as published by the Free Software Foundation. +# +# You can also Link and Combine this program with other software covered by +# the terms of any of the Free Software licenses or any of the Open Source +# Initiative approved licenses and Convey the resulting work. Corresponding +# source of such a combination shall include the source code for all other +# software used. +# +# This program is distributed WITHOUT ANY WARRANTY; without even the implied +# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# See COPYING file for full licensing terms. +# See https://www.nexedi.com/licensing for rationale and options. 
+ +# test for inside_counted +def _test_inside_counted(): # -> outok + outok = '' + + outok += '\n\n\nBEFORE PATCH\n' + print('\n\n\nBEFORE PATCH') + tfunc(3) + + t0 = '' + for i in range(3,0-1,-1): + t0 += '> tfunc(%d)\tinside_counter: 0\n' % i + for i in range(0,3+1,+1): + t0 += '< tfunc(%d)\tinside_counter: 0\n' % i + outok += t0 + + outok += '\n\n\nPATCHED\n' + print('\n\n\nPATCHED') + _patch = xfunchook_create() + global inside_counted_func + inside_counted_func = &tfunc + xfunchook_prepare(_patch, &inside_counted_func, inside_counted) + xfunchook_install(_patch, 0) + + tfunc(12) + + stk_size = 8 # = STK_SIZE from _golang_str_pickle.S + for i in range(12,0-1,-1): + outok += '> tfunc(%d)\tinside_counter: %d\n' % (i, min(12-i+1, stk_size)) + for i in range(0,12+1,+1): + outok += '< tfunc(%d)\tinside_counter: %d\n' % (i, min(12-i+1, stk_size)) + + outok += '\n\n\nUNPATCHED\n' + print('\n\n\nUNPATCHED') + xfunchook_uninstall(_patch, 0) + tfunc(3) + outok += t0 + + return outok + +cdef void tfunc(int x): + print('> tfunc(%d)\tinside_counter: %d' % (x, inside_counter)) + if x > 0: + tfunc(x-1) + print('< tfunc(%d)\tinside_counter: %d' % (x, inside_counter)) + + +def _test_cfunc_is_callee_cleanup(): + for t in _cfunc_is_callee_cleanup_testv: + stkclean = cfunc_is_callee_cleanup(t.cfunc) + assert stkclean == t.stkclean_by_callee_ok, (t.cfunc_name, stkclean, t.stkclean_by_callee_ok) + +cdef extern from * nogil: + r""" + struct _Test_cfunc_is_callee_clenup { + const char* cfunc_name; + void* cfunc; + int stkclean_by_callee_ok; + }; + + #define CASE(func, stkclean_ok) \ + _Test_cfunc_is_callee_clenup{#func, (void*)func, stkclean_ok} + + #if defined(LIBGOLANG_ARCH_386) + int CALLCONV(cdecl) + tfunc_cdecl1(int x) { return x; } + int CALLCONV(cdecl) + tfunc_cdecl2(int x, int y) { return x; } + int CALLCONV(cdecl) + tfunc_cdecl3(int x, int y, int z) { return x; } + + int CALLCONV(stdcall) + tfunc_stdcall1(int x) { return x; } + int CALLCONV(stdcall) + tfunc_stdcall2(int x, 
int y) { return x; } + int CALLCONV(stdcall) + tfunc_stdcall3(int x, int y, int z) { return x; } + + int CALLCONV(fastcall) + tfunc_fastcall1(int x) { return x; } + int CALLCONV(fastcall) + tfunc_fastcall2(int x, int y) { return x; } + int CALLCONV(fastcall) + tfunc_fastcall3(int x, int y, int z) { return x; } + + #ifndef LIBGOLANG_CC_msc // see note about C3865 in FOR_EACH_CALLCONV + int CALLCONV(thiscall) + tfunc_thiscall1(int x) { return x; } + int CALLCONV(thiscall) + tfunc_thiscall2(int x, int y) { return x; } + int CALLCONV(thiscall) + tfunc_thiscall3(int x, int y, int z) { return x; } + #endif + + #ifndef LIBGOLANG_CC_msc // no regparm on MSCV + int CALLCONV(regparm(1)) + tfunc_regparm1_1(int x) { return x; } + int CALLCONV(regparm(1)) + tfunc_regparm1_2(int x, int y) { return x; } + int CALLCONV(regparm(1)) + tfunc_regparm1_3(int x, int y, int z) { return x; } + + int CALLCONV(regparm(2)) + tfunc_regparm2_1(int x) { return x; } + int CALLCONV(regparm(2)) + tfunc_regparm2_2(int x, int y) { return x; } + int CALLCONV(regparm(2)) + tfunc_regparm2_3(int x, int y, int z) { return x; } + + int CALLCONV(regparm(3)) + tfunc_regparm3_1(int x) { return x; } + int CALLCONV(regparm(3)) + tfunc_regparm3_2(int x, int y) { return x; } + int CALLCONV(regparm(3)) + tfunc_regparm3_3(int x, int y, int z) { return x; } + #endif + + static std::vector<_Test_cfunc_is_callee_clenup> _cfunc_is_callee_cleanup_testv = { + CASE(tfunc_cdecl1 , 0 * 4), + CASE(tfunc_cdecl2 , 0 * 4), + CASE(tfunc_cdecl3 , 0 * 4), + CASE(tfunc_stdcall1 , 1 * 4), + CASE(tfunc_stdcall2 , 2 * 4), + CASE(tfunc_stdcall3 , 3 * 4), + CASE(tfunc_fastcall1 , 0 * 4), + CASE(tfunc_fastcall2 , 0 * 4), + CASE(tfunc_fastcall3 , 1 * 4), + #ifndef LIBGOLANG_CC_msc + CASE(tfunc_thiscall1 , 0 * 4), + CASE(tfunc_thiscall2 , 1 * 4), + CASE(tfunc_thiscall3 , 2 * 4), + #endif + #ifndef LIBGOLANG_CC_msc + CASE(tfunc_regparm1_1 , 0 * 4), + CASE(tfunc_regparm1_2 , 0 * 4), + CASE(tfunc_regparm1_3 , 0 * 4), + CASE(tfunc_regparm2_1 
, 0 * 4), + CASE(tfunc_regparm2_2 , 0 * 4), + CASE(tfunc_regparm2_3 , 0 * 4), + CASE(tfunc_regparm3_1 , 0 * 4), + CASE(tfunc_regparm3_2 , 0 * 4), + CASE(tfunc_regparm3_3 , 0 * 4), + #endif + }; + + #else + // only i386 has many calling conventions + int tfunc_default(int x, int y, int z) { return x; } + + static std::vector<_Test_cfunc_is_callee_clenup> _cfunc_is_callee_cleanup_testv = { + CASE(tfunc_default, 0), + }; + #endif + + #undef CASE + """ + struct _Test_cfunc_is_callee_clenup: + const char* cfunc_name + void* cfunc + int stkclean_by_callee_ok + + vector[_Test_cfunc_is_callee_clenup] _cfunc_is_callee_cleanup_testv diff --git a/golang/_strconv.pyx b/golang/_strconv.pyx index 8ffd6f5..3b1db0c 100644 --- a/golang/_strconv.pyx +++ b/golang/_strconv.pyx @@ -28,12 +28,11 @@ from golang cimport pyb, byte, rune from golang cimport _utf8_decode_rune, _xunichr from golang.unicode cimport utf8 -from cpython cimport PyObject +from cpython cimport PyObject, _PyBytes_Resize cdef extern from "Python.h": PyObject* PyBytes_FromStringAndSize(char*, Py_ssize_t) except NULL char* PyBytes_AS_STRING(PyObject*) - int _PyBytes_Resize(PyObject**, Py_ssize_t) except -1 void Py_DECREF(PyObject*) @@ -65,7 +64,7 @@ cdef bytes _quote(const byte[::1] s, char quote, bint* out_nonascii_escape): # - cdef byte c q[0] = quote; q += 1 while i < len(s): - c = s[i] + c = s[i] # XXX -> use raw pointer in the loop # fast path - ASCII only if c < 0x80: if c in (ord('\\'), quote): @@ -104,7 +103,8 @@ cdef bytes _quote(const byte[::1] s, char quote, bint* out_nonascii_escape): # - # slow path - full UTF-8 decoding + unicodedata else: - r, size = _utf8_decode_rune(s[i:]) + # XXX optimize non-ascii case + r, size = _utf8_decode_rune(s[i:]) # XXX -> raw pointer isize = i + size # decode error - just emit raw byte as escaped @@ -117,6 +117,9 @@ cdef bytes _quote(const byte[::1] s, char quote, bint* out_nonascii_escape): # - q += 4 # printable utf-8 characters go as is + # XXX ? 
use Py_UNICODE_ISPRINTABLE (py3, not available on py2) ? + # XXX ? and generate C table based on unicodedata for py2 ? + # XXX -> generate table based on unicodedata for both py2/py3 because Py_UNICODE_ISPRINTABLE is not exactly what matches strconv.IsPrint (i.e. cat starts from LNPS) elif _unicodedata_category(_xunichr(r))[0] in 'LNPS': # letters, numbers, punctuation, symbols for j in range(i, isize): q[0] = s[j] diff --git a/golang/fmt.h b/golang/fmt.h index f548f0b..7c33802 100644 --- a/golang/fmt.h +++ b/golang/fmt.h @@ -111,7 +111,7 @@ inline error errorf(const string& format, Argv... argv) { // `const char *` overloads just to catch format mistakes as // __attribute__(format) does not work with std::string. LIBGOLANG_API string sprintf(const char *format, ...) -#ifndef _MSC_VER +#ifndef LIBGOLANG_CC_msc __attribute__ ((format (printf, 1, 2))) #endif ; diff --git a/golang/golang_str_pickle_test.py b/golang/golang_str_pickle_test.py new file mode 100644 index 0000000..1bf1a7b --- /dev/null +++ b/golang/golang_str_pickle_test.py @@ -0,0 +1,512 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2022-2023 Nexedi SA and Contributors. +# Kirill Smelkov +# +# This program is free software: you can Use, Study, Modify and Redistribute +# it under the terms of the GNU General Public License version 3, or (at your +# option) any later version, as published by the Free Software Foundation. +# +# You can also Link and Combine this program with other software covered by +# the terms of any of the Free Software licenses or any of the Open Source +# Initiative approved licenses and Convey the resulting work. Corresponding +# source of such a combination shall include the source code for all other +# software used. +# +# This program is distributed WITHOUT ANY WARRANTY; without even the implied +# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# See COPYING file for full licensing terms. +# See https://www.nexedi.com/licensing for rationale and options. 
+ +from __future__ import print_function, absolute_import + +from golang import b, u, bstr, ustr +from golang.golang_str_test import xbytes, x32, unicode +from golang._golang import _test_inside_counted, _test_cfunc_is_callee_cleanup +from gpython.gpython_test import is_gpython +from pytest import raises, fixture, mark +import sys, io, struct +import six + +# run all tests on all py/c pickle modules we aim to support +import pickle as stdPickle +if six.PY2: + import cPickle +else: + import _pickle as cPickle +from zodbpickle import slowpickle as zslowPickle +from zodbpickle import fastpickle as zfastPickle +from zodbpickle import pickle as zpickle +from zodbpickle import _pickle as _zpickle +import pickletools as stdpickletools +if six.PY2: + from zodbpickle import pickletools_2 as zpickletools +else: + from zodbpickle import pickletools_3 as zpickletools + + +# pickle is pytest fixture that yields all variants of pickle module. +@fixture(scope="function", params=[stdPickle, cPickle, + zslowPickle, zfastPickle, zpickle, _zpickle]) +def pickle(request): + yield request.param + +# pickletools is pytest fixture that yields all variants of pickletools module. +@fixture(scope="function", params=[stdpickletools, zpickletools]) +def pickletools(request): + yield request.param + +# pickle2tools returns pickletools module that corresponds to module pickle. +def pickle2tools(pickle): + if pickle in (stdPickle, cPickle): + return stdpickletools + else: + return zpickletools + +# @gpystr_only is marker to run a test only under gpython -X gpython.strings=bstr+ustr +is_gpystr = type(u'') is ustr +gpystr_only = mark.skipif(not is_gpystr, reason="gpystr-only test") + + +# ---- pickling/unpickling under gpystr ---- + +# verify that loading *STRING opcodes loads them as bstr on gpython by default. +# TODO or with encoding='bstr' under plain py +@gpystr_only +def test_string_pickle_load_STRING(pickle): + p_str = b"S'\\xd0\\xbc\\xd0\\xb8\\xd1\\x80\\xff'\n." 
# STRING 'мир\xff' + p_utf8 = b"S'"+xbytes('мир')+b"\\xff'\n." # STRING 'мир\xff' + p_sbins = b'U\x07\xd0\xbc\xd0\xb8\xd1\x80\xff.' # SHORT_BINSTRING 'мир\xff' + p_bins = b'T\x07\x00\x00\x00\xd0\xbc\xd0\xb8\xd1\x80\xff.' # BINSTRING 'мир\xff' + + p_bytes = xbytes('мир')+b'\xff' + + # check invokes f on all test pickles + def check(f): + f(p_str) + f(p_utf8) + f(p_sbins) + f(p_bins) + + # default -> bstr on both py2 and py3 + # TODO only this check is gpystr_only -> remove whole-func @gpystr_only + def _(p): + obj = xloads(pickle, p) + assert type(obj) is bstr + assert obj == p_bytes + check(_) + + # also test bstr inside tuple (for symmetry with save) + def _(p): + p_ = b'(' + p[:-1] + b't.' + tobj = xloads(pickle, p_) + assert type(tobj) is tuple + assert len(tobj) == 1 + obj = tobj[0] + assert type(obj) is bstr + assert obj == p_bytes + check(_) + + # pickle supports encoding=... only on py3 + if six.PY3: + # encoding='bstr' -> bstr + def _(p): + obj = xloads(pickle, p, encoding='bstr') + assert type(obj) is bstr + assert obj == p_bytes + check(_) + + # encoding='bytes' -> bytes + def _(p): + obj = xloads(pickle, p, encoding='bytes') + assert type(obj) is bytes + assert obj == p_bytes + check(_) + + # encoding='utf-8' -> UnicodeDecodeError + def _(p): + with raises(UnicodeDecodeError): + xloads(pickle, p, encoding='utf-8') + check(_) + + # encoding='utf-8', errors=... -> unicode + def _(p): + obj = xloads(pickle, p, encoding='utf-8', errors='backslashreplace') + assert type(obj) is unicode + assert obj == u'мир\\xff' + check(_) + + +# verify that saving bstr results in *STRING opcodes on gpython. +@gpystr_only +def test_strings_pickle_save_STRING(pickle): + s = s0 = b(xbytes('мир')+b'\xff') + assert type(s) is bstr + + p_utf8 = b"S'"+xbytes('мир')+b"\\xff'\n." # STRING 'мир\xff' + p_sbins = b'U\x07\xd0\xbc\xd0\xb8\xd1\x80\xff.' # SHORT_BINSTRING 'мир\xff' + p_bins = b'T\x07\x00\x00\x00\xd0\xbc\xd0\xb8\xd1\x80\xff.' 
# BINSTRING 'мир\xff' + + def dumps(proto): + return xdumps(pickle, s, proto) + + assert dumps(0) == p_utf8 + for proto in range(1, HIGHEST_PROTOCOL(pickle)+1): + assert dumps(proto) == p_sbins + + # BINSTRING + s += b'\x55'*0x100 + p_bins_ = p_bins[:2] + b'\x01' + p_bins[3:-1] + b'\x55'*0x100 + b'.' + for proto in range(1, HIGHEST_PROTOCOL(pickle)+1): + assert dumps(proto) == p_bins_ + + # also test bstr inside tuple to verify that what we patched is actually + # _pickle.save that is invoked from inside other save_X functions. + s = (s0,) + p_tutf8 = b'(' + p_utf8[:-1] + b't.' + p_tsbins = b'(' + p_sbins[:-1] + b't.' + assert dumps(0) == p_tutf8 + assert dumps(1) == p_tsbins + # don't test proto ≥ 2 because they start to use TUPLE1 instead of TUPLE + + +# verify that loading *UNICODE opcodes loads them as unicode/ustr. +# this is standard behaviour but we verify it since we patch pickle's strings processing. +# also verify save lightly for symmetry. +# NOTE not @gpystr_only +def test_string_pickle_loadsave_UNICODE(pickle): + # NOTE builtin pickle behaviour is to save unicode via 'surrogatepass' error handler + # this means that b'мир\xff' -> ustr/unicode -> save will emit *UNICODE with + # b'мир\xed\xb3\xbf' instead of b'мир\xff' as data. + p_uni = b'V\\u043c\\u0438\\u0440\\udcff\n.' # UNICODE 'мир\uDCFF' + p_binu = b'X\x09\x00\x00\x00\xd0\xbc\xd0\xb8\xd1\x80\xed\xb3\xbf.' # BINUNICODE NOTE ...edb3bf not ...ff + p_sbinu = b'\x8c\x09\xd0\xbc\xd0\xb8\xd1\x80\xed\xb3\xbf.' # SHORT_BINUNICODE + p_binu8 = b'\x8d\x09\x00\x00\x00\x00\x00\x00\x00\xd0\xbc\xd0\xb8\xd1\x80\xed\xb3\xbf.' 
# BINUNICODE8 + + u_obj = u'мир\uDCFF'; assert type(u_obj) is unicode + + # load: check invokes f on all test pickles that pickle should support + def check(f): + f(p_uni) + f(p_binu) + if HIGHEST_PROTOCOL(pickle) >= 4: + f(p_sbinu) + f(p_binu8) + + def _(p): + obj = xloads(pickle, p) + assert type(obj) is unicode + assert obj == u_obj + check(_) + + # save + def dumps(proto): + return xdumps(pickle, u_obj, proto) + assert dumps(0) == p_uni + assert dumps(1) == p_binu + assert dumps(2) == p_binu + if HIGHEST_PROTOCOL(pickle) >= 3: + assert dumps(3) == p_binu + if HIGHEST_PROTOCOL(pickle) >= 4: + assert dumps(4) == p_sbinu + + +# ---- pickling/unpickling generally without gpystr ---- + +# verify that bstr/ustr can be pickled/unpickled correctly on !gpystr. +# gpystr should also load ok what was pickled on !gpystr. +# for uniformity gpystr is also verified to save/load objects correctly. +# However the main gpystr tests are load/save tests for *STRING and *UNICODE above. +def test_strings_pickle_bstr_ustr(pickle): + bs = b(xbytes('мир')+b'\xff') + us = u(xbytes('май')+b'\xff') + + def diss(p): return xdiss(pickle2tools(pickle), p) + def dis(p): print(diss(p)) + + # assert_pickle verifies that pickling obj results in + # + # - dumps_ok_gpystr (when run under gpython with gpython.string=bstr+ustr), or + # - dumps_ok_stdstr (when run under plain python or gpython with gpython.strings=pystd) + # + # and that unpickling results back in obj. + # + # gpystr should also unpickle !gpystr pickle correctly. 
+ assert HIGHEST_PROTOCOL(pickle) <= 5 + def assert_pickle(obj, proto, dumps_ok_gpystr, dumps_ok_stdstr): + if proto > HIGHEST_PROTOCOL(pickle): + with raises(ValueError): + xdumps(pickle, obj, proto) + return + p = xdumps(pickle, obj, proto) + if not is_gpystr: + assert p == dumps_ok_stdstr, diss(p) + dumps_okv = [dumps_ok_stdstr] + else: + assert p == dumps_ok_gpystr, diss(p) + dumps_okv = [dumps_ok_gpystr, dumps_ok_stdstr] + for p in dumps_okv: + #dis(p) + obj2 = xloads(pickle, p) + assert type(obj2) is type(obj) + assert obj2 == obj + + _ = assert_pickle + + _(bs, 0, xbytes("S'мир\\xff'\n."), # STRING + b"cgolang\nbstr\n(V\\u043c\\u0438\\u0440\\udcff\ntR.") # bstr(UNICODE) + + _(us, 0, b'V\\u043c\\u0430\\u0439\\udcff\n.', # UNICODE + b'cgolang\nustr\n(V\\u043c\\u0430\\u0439\\udcff\ntR.') # ustr(UNICODE) + + _(bs, 1, b'U\x07\xd0\xbc\xd0\xb8\xd1\x80\xff.', # SHORT_BINSTRING + b'cgolang\nbstr\n(X\x09\x00\x00\x00' # bstr(BINUNICODE) + b'\xd0\xbc\xd0\xb8\xd1\x80\xed\xb3\xbftR.') + + # NOTE BINUNICODE ...edb3bf not ...ff (see test_string_pickle_loadsave_UNICODE for details) + _(us, 1, b'X\x09\x00\x00\x00\xd0\xbc\xd0\xb0\xd0\xb9\xed\xb3\xbf.', # BINUNICODE + b'cgolang\nustr\n(X\x09\x00\x00\x00' # bstr(BINUNICODE) + b'\xd0\xbc\xd0\xb0\xd0\xb9\xed\xb3\xbftR.') + + _(bs, 2, b'U\x07\xd0\xbc\xd0\xb8\xd1\x80\xff.', # SHORT_BINSTRING + b'cgolang\nbstr\nX\x09\x00\x00\x00' # bstr(BINUNICODE) + b'\xd0\xbc\xd0\xb8\xd1\x80\xed\xb3\xbf\x85\x81.') + + _(us, 2, b'X\x09\x00\x00\x00\xd0\xbc\xd0\xb0\xd0\xb9\xed\xb3\xbf.', # BINUNICODE + b'cgolang\nustr\nX\x09\x00\x00\x00' # ustr(BINUNICODE) + b'\xd0\xbc\xd0\xb0\xd0\xb9\xed\xb3\xbf\x85\x81.') + + _(bs, 3, b'U\x07\xd0\xbc\xd0\xb8\xd1\x80\xff.', # SHORT_BINSTRING + b'cgolang\nbstr\nC\x07\xd0\xbc\xd0\xb8\xd1\x80\xff\x85\x81.') # bstr(SHORT_BINBYTES) + + _(us, 3, b'X\x09\x00\x00\x00\xd0\xbc\xd0\xb0\xd0\xb9\xed\xb3\xbf.', # BINUNICODE + b'cgolang\nustr\nX\x09\x00\x00\x00' # ustr(BINUNICODE) + b'\xd0\xbc\xd0\xb0\xd0\xb9\xed\xb3\xbf\x85\x81.') 
+ + for p in (4,5): + _(bs, p, + b'U\x07\xd0\xbc\xd0\xb8\xd1\x80\xff.', # SHORT_BINSTRING + b'\x8c\x06golang\x8c\x04bstr\x93C\x07' # bstr(SHORT_BINBYTES) + b'\xd0\xbc\xd0\xb8\xd1\x80\xff\x85\x81.') + _(us, p, + b'\x8c\x09\xd0\xbc\xd0\xb0\xd0\xb9\xed\xb3\xbf.', # SHORT_BINUNICODE + b'\x8c\x06golang\x8c\x04ustr\x93\x8c\x09' # ustr(SHORT_BINUNICODE) + b'\xd0\xbc\xd0\xb0\xd0\xb9\xed\xb3\xbf\x85\x81.') + + +# ---- disassembly ---- + +# xdiss returns disassembly of a pickle as string. +def xdiss(pickletools, p): # -> str + out = six.StringIO() + pickletools.dis(p, out) + return out.getvalue() + +# verify that disassembling *STRING opcodes works with treating strings as UTF8b. +@gpystr_only +def test_string_pickle_dis_STRING(pickletools): + p_str = b"S'\\xd0\\xbc\\xd0\\xb8\\xd1\\x80'\n." # STRING 'мир' + p_sbins = b'U\x06\xd0\xbc\xd0\xb8\xd1\x80.' # SHORT_BINSTRING 'мир' + p_bins = b'T\x06\x00\x00\x00\xd0\xbc\xd0\xb8\xd1\x80.' # BINSTRING 'мир' + + bmir = x32("b('мир')", "'мир'") + + assert xdiss(pickletools, p_str) == """\ + 0: S STRING %s + 28: . STOP +highest protocol among opcodes = 0 +""" % bmir + + assert xdiss(pickletools, p_sbins) == """\ + 0: U SHORT_BINSTRING %s + 8: . STOP +highest protocol among opcodes = 1 +""" % bmir + + assert xdiss(pickletools, p_bins) == """\ + 0: T BINSTRING %s + 11: . STOP +highest protocol among opcodes = 1 +""" % bmir + + +# ---- loads and normalized dumps ---- + +# xloads loads pickle p via pickle.loads +# it also verifies that .load and Unpickler.load give the same result. +def xloads(pickle, p, **kw): + obj1 = _xpickle_attr(pickle, 'loads')(p, **kw) + obj2 = _xpickle_attr(pickle, 'load') (io.BytesIO(p), **kw) + obj3 = _xpickle_attr(pickle, 'Unpickler')(io.BytesIO(p), **kw).load() + assert type(obj2) is type(obj1) + assert type(obj3) is type(obj1) + assert obj1 == obj2 == obj3 + return obj1 + +# xdumps dumps obj via pickle.dumps +# it also verifies that .dump and Pickler.dump give the same. 
+# the pickle is returned in normalized form - see pickle_normalize for details. +def xdumps(pickle, obj, proto, **kw): + p1 = _xpickle_attr(pickle, 'dumps')(obj, proto, **kw) + f2 = io.BytesIO(); _xpickle_attr(pickle, 'dump')(obj, f2, proto, **kw) + p2 = f2.getvalue() + f3 = io.BytesIO(); _xpickle_attr(pickle, 'Pickler')(f3, proto, **kw).dump(obj) + p3 = f3.getvalue() + assert type(p1) is bytes + assert type(p2) is bytes + assert type(p3) is bytes + assert p1 == p2 == p3 + + # remove not interesting parts: PROTO / FRAME header and unused PUTs + if proto >= 2: + protover = PROTO(proto) + assert p1.startswith(protover) + return pickle_normalize(pickle2tools(pickle), p1) + +def _xpickle_attr(pickle, name): + # on py3 pickle.py tries to import from C _pickle to optimize by default + # -> verify py version if we are asked to test pickle.py + if six.PY3 and (pickle is stdPickle): + assert getattr(pickle, name) is getattr(cPickle, name) + name = '_'+name + return getattr(pickle, name) + +# pickle_normalize returns normalized version of pickle p. +# +# - PROTO and FRAME opcodes are removed from header, +# - unused PUT, BINPUT and MEMOIZE opcodes - those without corresponding GET are removed, +# - *PUT indices start from 0 (this unifies cPickle with pickle). 
+def pickle_normalize(pickletools, p): + def iter_pickle(p): # -> i(op, arg, pdata) + op_prev = None + arg_prev = None + pos_prev = None + for op, arg, pos in pickletools.genops(p): + if op_prev is not None: + pdata_prev = p[pos_prev:pos] + yield (op_prev, arg_prev, pdata_prev) + op_prev = op + arg_prev = arg + pos_prev = pos + if op_prev is not None: + yield (op_prev, arg_prev, p[pos_prev:]) + + memo_oldnew = {} # idx used in original pop/get -> new index | None if not get + idx = 0 + for op, arg, pdata in iter_pickle(p): + if 'PUT' in op.name: + memo_oldnew.setdefault(arg, None) + elif 'MEMOIZE' in op.name: + memo_oldnew.setdefault(len(memo_oldnew), None) + elif 'GET' in op.name: + if memo_oldnew.get(arg) is None: + memo_oldnew[arg] = idx + idx += 1 + + pout = b'' + memo_old = set() # idx used in original pop + for op, arg, pdata in iter_pickle(p): + if op.name in ('PROTO', 'FRAME'): + continue + if 'PUT' in op.name: + memo_old.add(arg) + newidx = memo_oldnew.get(arg) + if newidx is None: + continue + pdata = globals()[op.name](newidx) + if 'MEMOIZE' in op.name: + idx = len(memo_old) + memo_old.add(idx) + newidx = memo_oldnew.get(idx) + if newidx is None: + continue + if 'GET' in op.name: + newidx = memo_oldnew[arg] + assert newidx is not None + pdata = globals()[op.name](newidx) + pout += pdata + return pout + +P = struct.pack +def PROTO(version): return b'\x80' + P('= 2 + + def _(p, p_normok): + p_norm = pickle_normalize(pickletools, p) + assert p_norm == p_normok, diss(p_norm) + + _(b'.', b'.') + _(b'I1\n.', b'I1\n.') + _(PROTO(2)+b'I1\n.', b'I1\n.') + + putgetv = [(PUT,GET), (BINPUT, BINGET)] + if proto >= 4: + putgetv.append((LONG_BINPUT, LONG_BINGET)) + for (put,get) in putgetv: + _(b'(I1\n'+put(1) + b'I2\n'+put(2) +b't'+put(3)+b'0'+get(3)+put(4)+b'.', + b'(I1\nI2\nt'+put(0)+b'0'+get(0)+b'.') + + if proto >= 4: + _(FRAME(4)+b'I1\n.', b'I1\n.') + _(b'I1\n'+MEMOIZE+b'I2\n'+MEMOIZE+GET(0)+b'.', + b'I1\n'+MEMOIZE+b'I2\n'+GET(0)+b'.') + + +# ---- internals of 
patching ---- + +# being able to cPickle bstr as STRING depends on proper working of inside_counted function. +# Verify it with dedicated unit test. +def test_inside_counted(capsys): + outok = _test_inside_counted() + _ = capsys.readouterr() + if _.err: + print(_.err, file=sys.stderr) + assert _.out == outok + +def test_cfunc_is_callee_cleanup(): + _test_cfunc_is_callee_cleanup() + +# verify that what we patched - e.g. PyUnicode_Decode - stay unaffected when +# called outside of bstr/ustr context. +# NOTE this test complements test_strings_patched_transparently in golang_str_test.py +def test_pickle_strings_patched_transparently(): + # PyUnicode_Decode stays working and unaffected + b_ = xbytes("abc") + _ = b_.decode(); assert type(_) is unicode; assert _ == u"abc" + _ = b_.decode("utf8"); assert type(_) is unicode; assert _ == u"abc" + _ = b_.decode("ascii"); assert type(_) is unicode; assert _ == u"abc" + + b_ = xbytes("мир") + _ = b_.decode("utf8"); assert type(_) is unicode; assert _ == u"мир" + with raises(UnicodeDecodeError): + b_.decode("ascii") + + +# ---- misc ---- + +# HIGHEST_PROTOCOL returns highest protocol supported by pickle. 
+def HIGHEST_PROTOCOL(pickle): + if six.PY3 and pickle is cPickle: + pmax = stdPickle.HIGHEST_PROTOCOL # py3: _pickle has no .HIGHEST_PROTOCOL + elif six.PY3 and pickle is _zpickle: + pmax = zpickle.HIGHEST_PROTOCOL # ----//---- for _zpickle + else: + pmax = pickle.HIGHEST_PROTOCOL + assert pmax >= 2 + return pmax diff --git a/golang/golang_str_test.py b/golang/golang_str_test.py index 71c4cff..0692de7 100644 --- a/golang/golang_str_test.py +++ b/golang/golang_str_test.py @@ -146,9 +146,17 @@ def test_strings_basic(): _ = ustr(123); assert type(_) is ustr; assert _ == '123' _ = bstr([1,'β']); assert type(_) is bstr; assert _ == "[1, 'β']" _ = ustr([1,'β']); assert type(_) is ustr; assert _ == "[1, 'β']" - obj = object() - _ = bstr(obj); assert type(_) is bstr; assert _ == str(obj) # - _ = ustr(obj); assert type(_) is ustr; assert _ == str(obj) # + obj = object(); assert str(obj).startswith('", + "") + _ = bstr(ecls); assert type(_) is bstr; assert _ == str(ecls) + _ = ustr(ecls); assert type(_) is ustr; assert _ == str(ecls) + exc = RuntimeError('zzz'); assert str(exc) == 'zzz' + _ = bstr(exc); assert type(_) is bstr; assert _ == str(exc) + _ = ustr(exc); assert type(_) is ustr; assert _ == str(exc) + # when stringifying they also handle bytes/bytearray inside containers as UTF-8 strings _ = bstr([xunicode( 'β')]); assert type(_) is bstr; assert _ == "['β']" @@ -246,10 +254,12 @@ def test_strings_basic(): assert hash(bs) == hash("мир"); assert bs == "мир" # str/repr + def rb(x,y): return xb32(x, 'b'+y,y) + def ru(x,y): return xu32(x, y,'u'+y) _ = str(us); assert isinstance(_, str); assert _ == "мир" _ = str(bs); assert isinstance(_, str); assert _ == "мир" - _ = repr(us); assert isinstance(_, str); assert _ == "u('мир')" - _ = repr(bs); assert isinstance(_, str); assert _ == "b('мир')" + _ = repr(us); assert isinstance(_, str); assert _ == ru("u('мир')", "'мир'") + _ = repr(bs); assert isinstance(_, str); assert _ == rb("b('мир')", "'мир'") # str/repr of non-valid 
utf8 b_hik8 = xbytes ('привет ')+b(k8mir_bytes); assert type(b_hik8) is bstr @@ -259,11 +269,17 @@ def test_strings_basic(): _ = str(u_hik8); assert isinstance(_, str); assert _ == xbytes('привет ')+b'\xcd\xc9\xd2' _ = str(b_hik8); assert isinstance(_, str); assert _ == xbytes('привет ')+b'\xcd\xc9\xd2' - _ = repr(u_hik8); assert isinstance(_, str); assert _ == r"u(b'привет \xcd\xc9\xd2')" - _ = repr(b_hik8); assert isinstance(_, str); assert _ == r"b(b'привет \xcd\xc9\xd2')" + _ = repr(u_hik8); assert isinstance(_, str); assert _ == r"u(b'привет \xcd\xc9\xd2')" + # NOTE ^^^ same for u,3/2 + _ = repr(b_hik8); assert isinstance(_, str); assert _ == rb(r"b(b'привет \xcd\xc9\xd2')", + r"'привет \xcd\xc9\xd2'") # str/repr of quotes def _(text, breprok, ureprok): + assert breprok[:2] == "b("; assert breprok[-1] == ")" + assert ureprok[:2] == "u("; assert ureprok[-1] == ")" + breprok = rb(breprok, breprok[2:-1]) # b('...') or '...' if bytes patched + ureprok = ru(ureprok, ureprok[2:-1]) # u('...') or '...' if unicode patched bt = b(text); assert type(bt) is bstr ut = u(text); assert type(ut) is ustr _ = str(bt); assert isinstance(_, str); assert _ == text @@ -286,20 +302,26 @@ def _(text, breprok, ureprok): # verify that bstr/ustr are created with correct refcount. def test_strings_refcount(): + # buffer with string data - not bytes nor unicode so that when builting + # string types are patched no case where bytes is created from the same + # bytes, or unicode is created from the same unicode - only increasing + # refcount of original object. 
+ data = bytearray([ord('a'), ord('b'), ord('c'), ord('4')]) + # first verify our logic on std type - obj = xbytes(u'abc'); assert type(obj) is bytes + obj = bytes(data); assert type(obj) is bytes gc.collect(); assert sys.getrefcount(obj) == 1+1 # +1 due to obj passed to getrefcount call # bstr - obj = b('abc'); assert type(obj) is bstr + obj = b(data); assert type(obj) is bstr gc.collect(); assert sys.getrefcount(obj) == 1+1 - obj = bstr('abc'); assert type(obj) is bstr + obj = bstr(data); assert type(obj) is bstr gc.collect(); assert sys.getrefcount(obj) == 1+1 # ustr - obj = u('abc'); assert type(obj) is ustr + obj = u(data); assert type(obj) is ustr gc.collect(); assert sys.getrefcount(obj) == 1+1 - obj = ustr('abc'); assert type(obj) is ustr + obj = ustr(data); assert type(obj) is ustr gc.collect(); assert sys.getrefcount(obj) == 1+1 @@ -326,26 +348,6 @@ def _(i): # returns m[i] as int assert _(5) == 0x80 -# verify that bstr/ustr can be pickled/unpickled correctly. -def test_strings_pickle(): - bs = b("мир") - us = u("май") - - #from pickletools import dis - for proto in range(0, pickle.HIGHEST_PROTOCOL+1): - p_bs = pickle.dumps(bs, proto) - #dis(p_bs) - bs_ = pickle.loads(p_bs) - assert type(bs_) is bstr - assert bs_ == bs - - p_us = pickle.dumps(us, proto) - #dis(p_us) - us_ = pickle.loads(p_us) - assert type(us_) is ustr - assert us_ == us - - # verify that ord on bstr/ustr works as expected. def test_strings_ord(): with raises(TypeError): ord(b('')) @@ -617,7 +619,8 @@ def test_strings_iter(): # iter( b/u/unicode ) -> iterate unicode characters # NOTE that iter(b) too yields unicode characters - not integers or bytes - bi = iter(bs) + #bi = iter(bs) # XXX temp disabled + bi = iter(us) ui = iter(us) ui_ = iter(u_) class XIter: @@ -1100,64 +1103,65 @@ def xfmt(fmt, args): # _bprintf parses %-format ourselves. Verify that parsing first # NOTE here all strings are plain ASCII. 
- def _(fmt, args): + def _(fmt, args, ok): fmt = '*str '+fmt - for l in range(len(fmt), -1, -1): - # [:len(fmt)] verifies original case - # [:l bstr/ustr even for ASCII string - _("*str a %s z", '*str \'"\x7f') - _("*str a %s z", 'β') - _("*str a %s z", ('β',)) + # NOTE *str to force str -> bstr/ustr even for ASCII string + _("*str a %s z", 123 , "*str a 123 z") + _("*str a %s z", '*str \'"\x7f' , "*str a *str '\"\x7f z") + _("*str a %s z", 'β' , "*str a β z") + _("*str a %s z", ('β',) , "*str a β z") _("*str a %s z", ['β'] , "*str a ['β'] z") - _("a %s π", 123) - _("a %s π", '*str \'"\x7f') - _("a %s π", 'β') - _("a %s π", ('β',)) + _("a %s π", 123 , "a 123 π") + _("a %s π", '*str \'"\x7f' , "a *str '\"\x7f π") + _("a %s π", 'β' , "a β π") + _("a %s π", ('β',) , "a β π") _("a %s π", ['β'] , "a ['β'] π") - _("α %s z", 123) - _("α %s z", '*str \'"\x7f') - _("α %s z", 'β') - _("α %s z", ('β',)) + _("α %s z", 123 , "α 123 z") + _("α %s z", '*str \'"\x7f' , "α *str '\"\x7f z") + _("α %s z", 'β' , "α β z") + _("α %s z", ('β',) , "α β z") _("α %s z", ['β'] , "α ['β'] z") - _("α %s π", 123) - _("α %s π", '*str \'"\x7f') - _("α %s π", 'β') - _("α %s π", ('β',)) - _("α %s π", ('β',)) - _("α %s %s π", ('β', 'γ')) - _("α %s %s %s π", ('β', 'γ', 'δ')) - _("α %s %s %s %s %s %s %s π", (1, 'β', 2, 'γ', 3, 'δ', 4)) - _("α %s π", []) - _("α %s π", ([],)) - _("α %s π", ((),)) - _("α %s π", set()) - _("α %s π", (set(),)) - _("α %s π", frozenset()) - _("α %s π", (frozenset(),)) - _("α %s π", ({},)) + _("α %s π", 123 , "α 123 π") + _("α %s π", '*str \'"\x7f' , "α *str '\"\x7f π") + _("α %s π", 'β' , "α β π") + _("α %s π", ('β',) , "α β π") + _("α %s π", ('β',) , "α β π") + _("α %s %s π", ('β', 'γ') , "α β γ π") + _("α %s %s %s π", ('β', 'γ', 'δ') , "α β γ δ π") + _("α %s %s %s %s %s %s %s π", (1, 'β', 2, 'γ', 3, 'δ', 4), + "α 1 β 2 γ 3 δ 4 π") + _("α %s π", [] , "α [] π") + _("α %s π", ([],) , "α [] π") + _("α %s π", ((),) , "α () π") + _("α %s π", set() , x32("α set() π", "α set([]) 
π")) + _("α %s π", (set(),) , x32("α set() π", "α set([]) π")) + _("α %s π", frozenset() , x32("α frozenset() π", "α frozenset([]) π")) + _("α %s π", (frozenset(),) , x32("α frozenset() π", "α frozenset([]) π")) + _("α %s π", ({},) , "α {} π") _("α %s π", ['β'] , "α ['β'] π") _("α %s π", (['β'],) , "α ['β'] π") _("α %s π", (('β',),) , "α ('β',) π") @@ -1279,7 +1285,8 @@ def _(fmt, args, *okv): # recursive frozenset l = hlist() f = frozenset({1, l}); l.append(f) - _('α %s π', (f,)) + _('α %s π', (f,) , *x32(("α frozenset({1, [frozenset(...)]}) π", "α frozenset({[frozenset(...)], 1}) π"), + ("α frozenset([1, [frozenset(...)]]) π", "α frozenset([[frozenset(...)], 1]) π"))) # recursive dict (via value) d = {1:'мир'}; d.update({2:d}) @@ -1296,15 +1303,15 @@ def _(fmt, args, *okv): class Cold: def __repr__(self): return "Cold()" def __str__(self): return u"Класс (old)" - _('α %s π', Cold()) - _('α %s π', (Cold(),)) + _('α %s π', Cold() , "α Класс (old) π") + _('α %s π', (Cold(),) , "α Класс (old) π") # new-style class with __str__ class Cnew(object): def __repr__(self): return "Cnew()" def __str__(self): return u"Класс (new)" - _('α %s π', Cnew()) - _('α %s π', (Cnew(),)) + _('α %s π', Cnew() , "α Класс (new) π") + _('α %s π', (Cnew(),) , "α Класс (new) π") # custom classes inheriting from set/list/tuple/dict/frozenset @@ -1334,7 +1341,10 @@ class D(dict): pass # namedtuple cc = collections; xcc = six.moves Point = cc.namedtuple('Point', ['x', 'y']) - _('α %s π', (Point('β','γ'),) , "α Point(x='β', y='γ') π") + verify_fmt_all_types(lambda fmt, args: fmt % args, + 'α %s π', Point('β','γ') , TypeError("not all arguments converted during string formatting"), excok=True) + _('α %s %s π',Point('β','γ') , "α β γ π") + _('α %s π', (Point('β','γ'),) , "α Point(x='β', y='γ') π") # deque _('α %s π', cc.deque(['β','γ']) , "α deque(['β', 'γ']) π") _('α %s π', (cc.deque(['β','γ']),) , "α deque(['β', 'γ']) π") @@ -1536,6 +1546,14 @@ def test_strings__format__(): # verify print for 
bstr/ustr. def test_strings_print(): outok = readfile(dir_testprog + "/golang_test_str.txt") + # repr(bstr|ustr) is changed if string types are patched: + # b('...') -> '...' if bstr is patched in + # u('...') -> u'...' if ustr is patched in (here we assume it is all valid utf8 there) + if bstr is bytes: + outok = re.sub(br"b\((.*?)\)", x32(r"b\1", r"\1"), outok) + if ustr is unicode: + outok = re.sub(br"u\((.*?)\)", x32(r"\1", r"u\1"), outok) + retcode, stdout, stderr = _pyrun(["golang_test_str.py"], cwd=dir_testprog, stdout=PIPE, stderr=PIPE) assert retcode == 0, (stdout, stderr) @@ -1578,7 +1596,11 @@ def checkop(s, meth, *argv, **kw): ur = xcall(us, meth, *argv, **kw) def assertDeepEQ(a, b, bstrtype): - assert not isinstance(a, (bstr, ustr)) + # `assert not isinstance(a, (bstr, ustr))` done carefully not to + # break when bytes/unicode are patched with bstr/ustr + if isinstance(a, bytes): assert type(a) is bytes + if isinstance(a, unicode): assert type(a) is unicode + if type(a) is unicode: assert type(b) is bstrtype assert a == b @@ -1841,6 +1863,26 @@ class MyStr(tx): _ = b(xx); assert type(_) is bstr ; assert _ == 'мир' _ = u(xx); assert type(_) is ustr ; assert _ == 'мир' + # __str__ returns *str, not MyStr + txstr = { + unicode: str, + bstr: x32(ustr, bstr), + ustr: x32(ustr, bstr), + }[tx] + if six.PY2 and tx is unicode: # on py2 unicode.__str__ raises UnicodeEncodeError: + aa = u'mir' # `'ascii' codec can't encode ...` -> do the test on ascii + _ = aa.__str__(); assert _ == 'mir' + else: + _ = xx.__str__(); assert _ == 'мир' + assert type(_) is txstr + + # for bstr/ustr __bytes__/__unicode__ return *str, never MyStr + # (builtin unicode has no __bytes__/__unicode__) + if tx is not unicode: + _ = xx.__bytes__(); assert type(_) is bstr; assert _ == 'мир' + _ = xx.__unicode__(); assert type(_) is ustr; assert _ == 'мир' + + # subclass with __str__ class MyStr(tx): def __str__(self): return u'αβγ' @@ -1864,6 +1906,17 @@ def __str__(self): with 
raises(TypeError): u(xx) +# verify that bstr/ustr has no extra attributes compared to str and UserString. +# (else e.g. IPython's guarded_eval.py fails when doing `_list_methods(collections.UserString, dir(str)`. +# XXX gpython-only ? +@mark.parametrize('tx', (bstr, ustr)) +def _test_strings_no_extra_methods(tx): # XXX reenable (str does not have __bytes__) + from six.moves import UserString + for attr in dir(tx): + assert hasattr(str, attr) + assert hasattr(UserString, attr) + + def test_qq(): # NOTE qq is also tested as part of strconv.quote @@ -2417,20 +2470,24 @@ def R(x): # verify that what we patched - e.g. bytes.__repr__ - stay unaffected when # called outside of bstr/ustr context. +# NOTE this test is complemented by test_pickle_strings_patched_transparently in golang_str_pickle_test.py def test_strings_patched_transparently(): b_ = xbytes ("мир"); assert type(b_) is bytes u_ = xunicode ("мир"); assert type(u_) is unicode ba_ = xbytearray("мир"); assert type(ba_) is bytearray # standard {repr,str}(bytes|unicode|bytearray) stay unaffected - assert repr(b_) == x32(r"b'\xd0\xbc\xd0\xb8\xd1\x80'", - r"'\xd0\xbc\xd0\xb8\xd1\x80'") - assert repr(u_) == x32(r"'мир'", - r"u'\u043c\u0438\u0440'") + assert repr(b_) == xB32(x32("b'мир'", "'мир'"), + r"b'\xd0\xbc\xd0\xb8\xd1\x80'", + r"'\xd0\xbc\xd0\xb8\xd1\x80'") + assert repr(u_) == xU32(x32("'мир'", "u'мир'"), + r"'мир'", + r"u'\u043c\u0438\u0440'") assert repr(ba_) == r"bytearray(b'\xd0\xbc\xd0\xb8\xd1\x80')" - assert str(b_) == x32(r"b'\xd0\xbc\xd0\xb8\xd1\x80'", - "\xd0\xbc\xd0\xb8\xd1\x80") + assert str(b_) == xS32("мир", + r"b'\xd0\xbc\xd0\xb8\xd1\x80'", + "\xd0\xbc\xd0\xb8\xd1\x80") if six.PY3 or sys.getdefaultencoding() == 'utf-8': # py3 or gpython/py2 assert str(u_) == "мир" else: @@ -2438,8 +2495,9 @@ def test_strings_patched_transparently(): with raises(UnicodeEncodeError): str(u_) # 'ascii' codec can't encode ... 
assert str(u'abc') == "abc" - assert str(ba_) == x32(r"bytearray(b'\xd0\xbc\xd0\xb8\xd1\x80')", - b'\xd0\xbc\xd0\xb8\xd1\x80') + assert str(ba_) == xS32("мир", + r"bytearray(b'\xd0\xbc\xd0\xb8\xd1\x80')", + b'\xd0\xbc\xd0\xb8\xd1\x80') # unicode comparison stay unaffected assert (u_ == u_) is True @@ -2458,9 +2516,10 @@ def test_strings_patched_transparently(): assert (u_ >= u2) is True ; assert (u2 >= u_) is False # bytearray.__init__ stay unaffected - with raises(TypeError): bytearray(u'мир') - a = bytearray() - with raises(TypeError): a.__init__(u'мир') + if ustr is not unicode: + with raises(TypeError): bytearray(u'мир') + a = bytearray() + with raises(TypeError): a.__init__(u'мир') def _(*argv): a = bytearray(*argv) @@ -2530,9 +2589,29 @@ def bench_bencode(b): # xbytes/xunicode/xbytearray convert provided bytes/unicode object to bytes, # unicode or bytearray correspondingly to function name. -def xbytes(x): return x.encode('utf-8') if type(x) is unicode else x -def xunicode(x): return x.decode('utf-8') if type(x) is bytes else x -def xbytearray(x): return bytearray(xbytes(x)) +def xbytes(x): + assert isinstance(x, (bytes,unicode)) + if isinstance(x, unicode): + x = x.encode('utf-8') + assert isinstance(x, bytes) + x = _bdata(x) + assert type(x) is bytes + return x + +def xunicode(x): + assert isinstance(x, (bytes,unicode)) + if isinstance(x, bytes): + x = x.decode('utf-8') + assert isinstance(x, unicode) + x = _udata(x) + assert type(x) is unicode + return x + +def xbytearray(x): + assert isinstance(x, (bytes,unicode)) + x = bytearray(xbytes(x)) + assert type(x) is bytearray + return x # deepReplaceStr2Bytearray replaces str to bytearray, or hashable-version of # bytearray, if str objects are detected to be present inside set or dict keys. 
@@ -2625,3 +2704,29 @@ def __hash__(self): # x32(a,b) returns a on py3, or b on py2 def x32(a, b): return a if six.PY3 else b + +# xb32(x, y, z) returns x if (bstr is not bytes) or x32(y,z) +# xu32(x, y, z) returns x if (ustr is not unicode) or x32(y,z) +def xb32(x, y, z): + return x if (bstr is not bytes) else x32(y,z) +def xu32(x, y, z): + return x if (ustr is not unicode) else x32(y,z) + +# xB32(x, y, z) returns x if (bstr is bytes) or x32(y,z) +# xU32(x, y, z) returns x if (ustr is unicode) or x32(y,z) +# xS32(x, y, z) returns x if (str is bstr|ustr) or x32(y,z) +# XXX replace usage of xB32 to directly via xB ? +def xB32(x, y, z): return xB(x, x32(y,z)) +def xU32(x, y, z): return xU(x, x32(y,z)) +def xS32(x, y, z): return xS(x, x32(y,z)) + + +# xB(x, y) returns x if (bstr is bytes) or y +# xU(x, y) returns x if (ustr is unicode) or y +# xS(x, y) returns x if (str is bstr|ustr) or y +def xB(x, y): + return x if (bstr is bytes) else y +def xU(x, y): + return x if (ustr is unicode) else y +def xS(x, y): + return x if (str is bstr or str is ustr) else y diff --git a/golang/libgolang.h b/golang/libgolang.h index 0d4c153..53a8aec 100644 --- a/golang/libgolang.h +++ b/golang/libgolang.h @@ -169,6 +169,8 @@ // [1] Libtask: a Coroutine Library for C and Unix. https://swtch.com/libtask. // [2] http://9p.io/magic/man2html/2/thread. 
+#include "golang/runtime/platform.h" + #include #include #include @@ -177,21 +179,18 @@ #include #include -#ifdef _MSC_VER // no mode_t on msvc +#ifdef LIBGOLANG_CC_msc // no mode_t on msvc typedef int mode_t; #endif // DSO symbols visibility (based on https://gcc.gnu.org/wiki/Visibility) -#if defined _WIN32 || defined __CYGWIN__ +#ifdef LIBGOLANG_OS_windows #define LIBGOLANG_DSO_EXPORT __declspec(dllexport) #define LIBGOLANG_DSO_IMPORT __declspec(dllimport) -#elif __GNUC__ >= 4 +#else #define LIBGOLANG_DSO_EXPORT __attribute__ ((visibility ("default"))) #define LIBGOLANG_DSO_IMPORT __attribute__ ((visibility ("default"))) -#else - #define LIBGOLANG_DSO_EXPORT - #define LIBGOLANG_DSO_IMPORT #endif #if BUILDING_LIBGOLANG diff --git a/golang/os.cpp b/golang/os.cpp index bc37c64..a7c7f2a 100644 --- a/golang/os.cpp +++ b/golang/os.cpp @@ -38,7 +38,7 @@ // cut this short // (on darwing sys_siglist declaration is normally provided) // (on windows sys_siglist is not available at all) -#if !(defined(__APPLE__) || defined(_WIN32)) +#if !(defined(LIBGOLANG_OS_darwin) || defined(LIBGOLANG_OS_windows)) extern "C" { extern const char * const sys_siglist[]; } @@ -287,7 +287,7 @@ string Signal::String() const { const Signal& sig = *this; const char *sigstr = nil; -#ifdef _WIN32 +#ifdef LIBGOLANG_OS_windows switch (sig.signo) { case SIGABRT: return "Aborted"; case SIGBREAK: return "Break"; diff --git a/golang/os.h b/golang/os.h index 0082544..9ad0c99 100644 --- a/golang/os.h +++ b/golang/os.h @@ -96,7 +96,7 @@ class _File : public object { // Open opens file @path. 
LIBGOLANG_API std::tuple Open(const string &path, int flags = O_RDONLY, mode_t mode = -#if !defined(_MSC_VER) +#if !defined(LIBGOLANG_CC_msc) S_IRUSR | S_IWUSR | S_IXUSR | S_IRGRP | S_IWGRP | S_IXGRP | S_IROTH | S_IWOTH | S_IXOTH diff --git a/golang/os/signal.cpp b/golang/os/signal.cpp index 9787c8d..793e7a4 100644 --- a/golang/os/signal.cpp +++ b/golang/os/signal.cpp @@ -89,7 +89,7 @@ #include #include -#if defined(_WIN32) +#if defined(LIBGOLANG_OS_windows) # include #endif @@ -101,7 +101,7 @@ # define debugf(format, ...) do {} while (0) #endif -#if defined(_MSC_VER) +#ifdef LIBGOLANG_CC_msc # define HAVE_SIGACTION 0 #else # define HAVE_SIGACTION 1 @@ -194,7 +194,7 @@ void _init() { if (err != nil) panic("os::newFile(_wakerx"); _waketx = vfd[1]; -#ifndef _WIN32 +#ifndef LIBGOLANG_OS_windows if (sys::Fcntl(_waketx, F_SETFL, O_NONBLOCK) < 0) panic("fcntl(_waketx, O_NONBLOCK)"); // TODO +syserr #else diff --git a/golang/pyx/build.py b/golang/pyx/build.py index 1cb5f3f..3c15f22 100644 --- a/golang/pyx/build.py +++ b/golang/pyx/build.py @@ -35,7 +35,7 @@ # pygolang uses setuptools_dso.DSO to build libgolang; all extensions link to it. import setuptools_dso -import sys, pkgutil, platform, sysconfig +import os, sys, pkgutil, platform, sysconfig from os.path import dirname, join, exists from distutils.errors import DistutilsError @@ -68,7 +68,7 @@ def _findpkg(pkgname): # -> _PyPkg # build_ext amends setuptools_dso.build_ext to allow combining C and C++ # sources in one extension without hitting `error: invalid argument -# '-std=c++11' not allowed with 'C'`. +# '-std=c++11' not allowed with 'C'`. XXX + asm _dso_build_ext = setuptools_dso.build_ext class build_ext(_dso_build_ext): def build_extension(self, ext): @@ -108,12 +108,33 @@ def filter_out(argprefix): # do per-source adjustsment only in .spawn . 
spawn = self.compiler.spawn def xspawn(argv): + argv = argv[:] + c = False - for arg in argv: + S = False + for i,arg in enumerate(argv): if arg.startswith('/Tc'): - c = True - if c: - argv = argv[:] + if arg.endswith('.S'): + argv[i] = arg[3:] # /Tcabc.S -> abc.S + S = True + else: + c = True + + # change cl.exe -> clang-cl.exe for assembly files so that assembler dialect is the same everywhere + if S: + assert argv[0] == self.compiler.cc, (argv, self.compiler.cc) + argv[0] = self.compiler.clang_cl + + # clang-cl fails on *.S if also given /EH... -> remove /EH... + while 1: + for i in range(len(argv)): + if argv[i].startswith('/EH'): + del argv[i] + break + else: + break + + if c or S: for i in range(len(argv)): if argv[i] == '/std:c++20': argv[i] = '/std:c11' @@ -128,6 +149,22 @@ def xspawn(argv): self.compiler._compile = _compile self.compiler.spawn = spawn + def build_extensions(self): + # adjust .compiler to support assembly sources + cc = self.compiler + if '.S' not in cc.src_extensions: + cc.src_extensions.append('.S') + cc.language_map['.S'] = 'asm' + cc.language_order.append('asm') + # XXX refer to https://blog.mozilla.org/nfroyd/2019/04/25/an-unexpected-benefit-of-standardizing-on-clang-cl/ + if cc.compiler_type == 'msvc': + if not cc.initialized: + cc.initialize() + ccmod = sys.modules[cc.__module__] + cc.clang_cl = ccmod._find_exe('clang-cl.exe', cc._paths.split(os.pathsep)) + cc._c_extensions.append('.S') # MSVCCompiler thinks it is C, but xspawn handles .S specially + _dso_build_ext.build_extensions(self) + # setup should be used instead of setuptools.setup def setup(**kw): @@ -176,8 +213,8 @@ def _with_build_defaults(name, kw): # -> (pygo, kw') incv.insert(1, join(pygo, 'golang', '_compat', sysname)) kw['include_dirs'] = incv - # link with libgolang.so if it is not libgolang itself - if name != 'golang.runtime.libgolang': + # link with libgolang.so if it is not libgolang itself, or another internal DSO + if name not in ('golang.runtime.libgolang', 
'golang.runtime.funchook'): dsov = kw.get('dsos', [])[:] dsov.insert(0, 'golang.runtime.libgolang') kw['dsos'] = dsov @@ -212,9 +249,11 @@ def _with_build_defaults(name, kw): # -> (pygo, kw') dependv = kw.get('depends', [])[:] dependv.extend(['%s/golang/%s' % (pygo, _) for _ in [ 'libgolang.h', + 'runtime.h', 'runtime/internal.h', 'runtime/internal/atomic.h', 'runtime/internal/syscall.h', + 'runtime/platform.h', 'context.h', 'cxx.h', 'errors.h', diff --git a/golang/runtime.cpp b/golang/runtime.cpp new file mode 100644 index 0000000..0fc63e6 --- /dev/null +++ b/golang/runtime.cpp @@ -0,0 +1,69 @@ +// Copyright (C) 2023 Nexedi SA and Contributors. +// Kirill Smelkov +// +// This program is free software: you can Use, Study, Modify and Redistribute +// it under the terms of the GNU General Public License version 3, or (at your +// option) any later version, as published by the Free Software Foundation. +// +// You can also Link and Combine this program with other software covered by +// the terms of any of the Free Software licenses or any of the Open Source +// Initiative approved licenses and Convey the resulting work. Corresponding +// source of such a combination shall include the source code for all other +// software used. +// +// This program is distributed WITHOUT ANY WARRANTY; without even the implied +// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +// +// See COPYING file for full licensing terms. +// See https://www.nexedi.com/licensing for rationale and options. + +// Package runtime mirrors Go package runtime. +// See runtime.h for package overview. 
+ +#include "golang/runtime.h" + + +// golang::runtime:: +namespace golang { +namespace runtime { + +const string ARCH = +#ifdef LIBGOLANG_ARCH_386 + "386" +#elif defined(LIBGOLANG_ARCH_amd64) + "amd64" +#elif defined(LIBGOLANG_ARCH_arm64) + "arm64" +#else +# error +#endif + ; + + +const string OS = +#ifdef LIBGOLANG_OS_linux + "linux" +#elif defined(LIBGOLANG_OS_darwin) + "darwin" +#elif defined(LIBGOLANG_OS_windows) + "windows" +#else +# error +#endif + ; + + +const string CC = +#ifdef LIBGOLANG_CC_gcc + "gcc" +#elif defined(LIBGOLANG_CC_clang) + "clang" +#elif defined(LIBGOLANG_CC_msc) + "msc" +#else +# error +#endif + ; + + +}} // golang::runtime:: diff --git a/golang/runtime.h b/golang/runtime.h new file mode 100644 index 0000000..60b5765 --- /dev/null +++ b/golang/runtime.h @@ -0,0 +1,50 @@ +#ifndef _NXD_LIBGOLANG_RUNTIME_H +#define _NXD_LIBGOLANG_RUNTIME_H + +// Copyright (C) 2023 Nexedi SA and Contributors. +// Kirill Smelkov +// +// This program is free software: you can Use, Study, Modify and Redistribute +// it under the terms of the GNU General Public License version 3, or (at your +// option) any later version, as published by the Free Software Foundation. +// +// You can also Link and Combine this program with other software covered by +// the terms of any of the Free Software licenses or any of the Open Source +// Initiative approved licenses and Convey the resulting work. Corresponding +// source of such a combination shall include the source code for all other +// software used. +// +// This program is distributed WITHOUT ANY WARRANTY; without even the implied +// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +// +// See COPYING file for full licensing terms. +// See https://www.nexedi.com/licensing for rationale and options. + +// Package runtime mirrors Go package runtime. 
+ +#include "golang/libgolang.h" + + +// golang::runtime:: +namespace golang { +namespace runtime { + +// ARCH indicates processor architecture, that is running the program. +// +// e.g. "386", "amd64", "arm64", ... +extern LIBGOLANG_API const string ARCH; + +// OS indicates operating system, that is running the program. +// +// e.g. "linux", "darwin", "windows", ... +extern LIBGOLANG_API const string OS; + +// CC indicates C/C++ compiler, that compiled the program. +// +// e.g. "gcc", "clang", "msc", ... +extern LIBGOLANG_API const string CC; + + +}} // golang::runtime:: + +#endif // _NXD_LIBGOLANG_RUNTIME_H diff --git a/golang/runtime/internal/atomic.cpp b/golang/runtime/internal/atomic.cpp index e3faa98..2669714 100644 --- a/golang/runtime/internal/atomic.cpp +++ b/golang/runtime/internal/atomic.cpp @@ -20,7 +20,7 @@ #include "golang/runtime/internal/atomic.h" #include "golang/libgolang.h" -#ifndef _WIN32 +#ifndef LIBGOLANG_OS_windows #include #endif @@ -44,7 +44,7 @@ static void _forkNewEpoch() { void _init() { // there is no fork on windows -#ifndef _WIN32 +#ifndef LIBGOLANG_OS_windows int e = pthread_atfork(/*prepare*/nil, /*inparent*/nil, /*inchild*/_forkNewEpoch); if (e != 0) panic("pthread_atfork failed"); diff --git a/golang/runtime/internal/syscall.cpp b/golang/runtime/internal/syscall.cpp index c998e17..4602c0a 100644 --- a/golang/runtime/internal/syscall.cpp +++ b/golang/runtime/internal/syscall.cpp @@ -58,9 +58,9 @@ string _Errno::Error() { char ebuf[128]; bool ok; -#if __APPLE__ +#ifdef LIBGOLANG_OS_darwin ok = (::strerror_r(-e.syserr, ebuf, sizeof(ebuf)) == 0); -#elif defined(_WIN32) +#elif defined(LIBGOLANG_OS_windows) ok = (::strerror_s(ebuf, sizeof(ebuf), -e.syserr) == 0); #else char *estr = ::strerror_r(-e.syserr, ebuf, sizeof(ebuf)); @@ -102,7 +102,7 @@ __Errno Close(int fd) { return err; } -#ifndef _WIN32 +#ifndef LIBGOLANG_OS_windows __Errno Fcntl(int fd, int cmd, int arg) { int save_errno = errno; int err = ::fcntl(fd, cmd, arg); @@ -124,7 
+124,7 @@ __Errno Fstat(int fd, struct ::stat *out_st) { int Open(const char *path, int flags, mode_t mode) { int save_errno = errno; -#ifdef _WIN32 // default to open files in binary mode +#ifdef LIBGOLANG_OS_windows // default to open files in binary mode if ((flags & (_O_TEXT | _O_BINARY)) == 0) flags |= _O_BINARY; #endif @@ -141,9 +141,9 @@ __Errno Pipe(int vfd[2], int flags) { return -EINVAL; int save_errno = errno; int err; -#ifdef __linux__ +#ifdef LIBGOLANG_OS_linux err = ::pipe2(vfd, flags); -#elif defined(_WIN32) +#elif defined(LIBGOLANG_OS_windows) err = ::_pipe(vfd, 4096, flags | _O_BINARY); #else err = ::pipe(vfd); @@ -167,7 +167,7 @@ __Errno Pipe(int vfd[2], int flags) { return err; } -#ifndef _WIN32 +#ifndef LIBGOLANG_OS_windows __Errno Sigaction(int signo, const struct ::sigaction *act, struct ::sigaction *oldact) { int save_errno = errno; int err = ::sigaction(signo, act, oldact); diff --git a/golang/runtime/internal/syscall.h b/golang/runtime/internal/syscall.h index e44160b..4771a19 100644 --- a/golang/runtime/internal/syscall.h +++ b/golang/runtime/internal/syscall.h @@ -63,13 +63,13 @@ LIBGOLANG_API int/*n|err*/ Read(int fd, void *buf, size_t count); LIBGOLANG_API int/*n|err*/ Write(int fd, const void *buf, size_t count); LIBGOLANG_API __Errno Close(int fd); -#ifndef _WIN32 +#ifndef LIBGOLANG_OS_windows LIBGOLANG_API __Errno Fcntl(int fd, int cmd, int arg); #endif LIBGOLANG_API __Errno Fstat(int fd, struct ::stat *out_st); LIBGOLANG_API int/*fd|err*/ Open(const char *path, int flags, mode_t mode); LIBGOLANG_API __Errno Pipe(int vfd[2], int flags); -#ifndef _WIN32 +#ifndef LIBGOLANG_OS_windows LIBGOLANG_API __Errno Sigaction(int signo, const struct ::sigaction *act, struct ::sigaction *oldact); #endif typedef void (*sighandler_t)(int); diff --git a/golang/runtime/libgolang.cpp b/golang/runtime/libgolang.cpp index 96208f8..3714cc7 100644 --- a/golang/runtime/libgolang.cpp +++ b/golang/runtime/libgolang.cpp @@ -52,7 +52,7 @@ #include // MSVC does 
not support statement expressions and typeof // -> redo list_entry via C++ lambda. -#ifdef _MSC_VER +#ifdef LIBGOLANG_CC_msc # undef list_entry # define list_entry(ptr, type, member) [&]() { \ const decltype( ((type *)0)->member ) *__mptr = (ptr); \ diff --git a/golang/runtime/platform.h b/golang/runtime/platform.h new file mode 100644 index 0000000..8def7e7 --- /dev/null +++ b/golang/runtime/platform.h @@ -0,0 +1,65 @@ +#ifndef _NXD_LIBGOLANG_RUNTIME_PLATFORM_H +#define _NXD_LIBGOLANG_RUNTIME_PLATFORM_H + +// Copyright (C) 2023 Nexedi SA and Contributors. +// Kirill Smelkov +// +// This program is free software: you can Use, Study, Modify and Redistribute +// it under the terms of the GNU General Public License version 3, or (at your +// option) any later version, as published by the Free Software Foundation. +// +// You can also Link and Combine this program with other software covered by +// the terms of any of the Free Software licenses or any of the Open Source +// Initiative approved licenses and Convey the resulting work. Corresponding +// source of such a combination shall include the source code for all other +// software used. +// +// This program is distributed WITHOUT ANY WARRANTY; without even the implied +// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +// +// See COPYING file for full licensing terms. +// See https://www.nexedi.com/licensing for rationale and options. + +// Header platform.h provides preprocessor defines that describe target platform. + +// LIBGOLANG_ARCH_ is defined on architecture X. +// +// List of supported architectures: 386, amd64, arm64. +#if defined(__i386__) || defined(_M_IX86) +# define LIBGOLANG_ARCH_386 1 +#elif defined(__x86_64__) || defined(_M_X64) +# define LIBGOLANG_ARCH_amd64 1 +#elif defined(__aarch64__) || defined(_M_ARM64) +# define LIBGOLANG_ARCH_arm64 1 +#else +# error "unsupported architecture" +#endif + +// LIBGOLANG_OS_ is defined on operating system X. 
+// +// List of supported operating systems: linux, darwin, windows. +#ifdef __linux__ +# define LIBGOLANG_OS_linux 1 +#elif defined(__APPLE__) +# define LIBGOLANG_OS_darwin 1 +#elif defined(_WIN32) || defined(__CYGWIN__) +# define LIBGOLANG_OS_windows 1 +#else +# error "unsupported operating system" +#endif + +// LIBGOLANG_CC_ is defined on C/C++ compiler X. +// +// List of supported compilers: gcc, clang, msc. +#ifdef __clang__ +# define LIBGOLANG_CC_clang 1 +#elif defined(_MSC_VER) +# define LIBGOLANG_CC_msc 1 +// NOTE gcc comes last because e.g. clang and icc define __GNUC__ as well +#elif __GNUC__ +# define LIBGOLANG_CC_gcc 1 +#else +# error "unsupported compiler" +#endif + +#endif // _NXD_LIBGOLANG_RUNTIME_PLATFORM_H diff --git a/gpython/.gitignore b/gpython/.gitignore new file mode 100644 index 0000000..ab6c7de --- /dev/null +++ b/gpython/.gitignore @@ -0,0 +1 @@ +/_gpython.cpp diff --git a/gpython/__init__.py b/gpython/__init__.py index c6be786..14b2fce 100755 --- a/gpython/__init__.py +++ b/gpython/__init__.py @@ -25,10 +25,14 @@ - gevent is pre-activated and stdlib is patched to be gevent aware; - go, chan, select etc are put into builtin namespace; -- default string encoding is always set to UTF-8. +- default string encoding is always set to UTF-8; +- bstr/ustr replace builtin str/unicode types. Gevent activation can be disabled via `-X gpython.runtime=threads`, or $GPYTHON_RUNTIME=threads. + +String types replacement can be disabled via `-X gpython.strings=pystd`, or +$GPYTHON_STRINGS=pystd. """ # NOTE gpython is kept out of golang/ , since even just importing e.g. 
golang.cmd.gpython, @@ -230,9 +234,13 @@ def run(mmain): gevent = sys.modules.get('gevent', None) gpyver = 'GPython %s' % golang.__version__ if gevent is not None: - gpyver += ' [gevent %s]' % gevent.__version__ + gpyver += ' [runtime gevent %s]' % gevent.__version__ + else: + gpyver += ' [runtime threads]' + if type(u'') is golang.ustr: + gpyver += ' [strings bstr+ustr]' else: - gpyver += ' [threads]' + gpyver += ' [strings pystd]' ver.append(gpyver) import platform @@ -344,6 +352,9 @@ def main(): # imported first, e.g. to support sys.modules. import sys + # import pyx/c part of gpython + from gpython import _gpython + # safety check that we are not running from a setuptools entrypoint, where # it would be too late to monkey-patch stdlib. # @@ -372,6 +383,7 @@ def main(): reload(sys) sys.setdefaultencoding('utf-8') delattr(sys, 'setdefaultencoding') + _gpython.set_utf8_as_default_src_encoding() # import os to get access to environment. @@ -381,10 +393,12 @@ def main(): import os # extract and process `-X gpython.*` - # -X gpython.runtime=(gevent|threads) + $GPYTHON_RUNTIME + # -X gpython.runtime=(gevent|threads) + $GPYTHON_RUNTIME + # -X gpython.strings=(bstr+ustr|pystd) + $GPYTHON_STRINGS sys._xoptions = getattr(sys, '_xoptions', {}) argv_ = [] gpy_runtime = os.getenv('GPYTHON_RUNTIME', 'gevent') + gpy_strings = os.getenv('GPYTHON_STRINGS', 'bstr+ustr') igetopt = _IGetOpt(sys.argv[1:], _pyopt, _pyopt_long) for (opt, arg) in igetopt: if opt == '-X': @@ -393,6 +407,10 @@ def main(): gpy_runtime = arg[len('gpython.runtime='):] sys._xoptions['gpython.runtime'] = gpy_runtime + elif arg.startswith('gpython.strings='): + gpy_strings = arg[len('gpython.strings='):] + sys._xoptions['gpython.strings'] = gpy_strings + else: raise RuntimeError('gpython: unknown -X option %s' % arg) @@ -412,13 +430,15 @@ def main(): # sys.executable spawned from under `gpython -X gpython.runtime=threads` # also uses "threads" runtime by default. 
os.environ['GPYTHON_RUNTIME'] = gpy_runtime + os.environ['GPYTHON_STRINGS'] = gpy_strings - # init initializes according to selected runtime + # init initializes according to selected runtime and strings # it is called after options are parsed and sys.path is setup correspondingly. # this way golang and gevent are imported from exactly the same place as # they would be in standard python after regular import (ex from golang/ # under cwd if run under `python -c ...` or interactive console. def init(): + gpy_runtime_ver = gpy_runtime if gpy_runtime == 'gevent': # make gevent pre-available & stdlib patched import gevent @@ -434,22 +454,30 @@ def init(): if _ not in (True, None): # patched or nothing to do # XXX provide details raise RuntimeError('gevent monkey-patching failed') - gpy_verextra = 'gevent %s' % gevent.__version__ + gpy_runtime_ver += ' %s' % gevent.__version__ elif gpy_runtime == 'threads': - gpy_verextra = 'threads' - + pass else: - raise RuntimeError('gpython: invalid runtime %s' % gpy_runtime) + raise RuntimeError('gpython: invalid runtime %r' % gpy_runtime) - # put go, chan, select, ... into builtin namespace + if gpy_strings not in ('bstr+ustr', 'pystd'): + raise RuntimeError('gpython: invalid strings %r' % gpy_strings) + + # import golang + # this will activate selected runtime and strings + sys._gpy_runtime = gpy_runtime + sys._gpy_strings = gpy_strings import golang + + # put go, chan, select, ... 
into builtin namespace from six.moves import builtins for k in golang.__all__: setattr(builtins, k, getattr(golang, k)) + setattr(builtins, 'CCC', CCC) # sys.version - sys.version += (' [GPython %s] [%s]' % (golang.__version__, gpy_verextra)) + sys.version += (' [GPython %s] [runtime %s] [strings %s]' % (golang.__version__, gpy_runtime_ver, gpy_strings)) # tail to pymain pymain(argv, init) @@ -567,5 +595,11 @@ def __next__(self): next = __next__ # for py2 +# for tests XXX continue by first writing test XXX +1/0 +class _tEarlyStrSubclass(str): + pass + + if __name__ == '__main__': main() diff --git a/gpython/_gpython.pyx b/gpython/_gpython.pyx new file mode 100644 index 0000000..ada1df8 --- /dev/null +++ b/gpython/_gpython.pyx @@ -0,0 +1,31 @@ +# -*- coding: utf-8 -*- +# cython: language_level=2 +# Copyright (C) 2023 Nexedi SA and Contributors. +# Kirill Smelkov +# +# This program is free software: you can Use, Study, Modify and Redistribute +# it under the terms of the GNU General Public License version 3, or (at your +# option) any later version, as published by the Free Software Foundation. +# +# You can also Link and Combine this program with other software covered by +# the terms of any of the Free Software licenses or any of the Open Source +# Initiative approved licenses and Convey the resulting work. Corresponding +# source of such a combination shall include the source code for all other +# software used. +# +# This program is distributed WITHOUT ANY WARRANTY; without even the implied +# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# See COPYING file for full licensing terms. +# See https://www.nexedi.com/licensing for rationale and options. +"""_gpython.pyx ... 
XXX +""" + +cdef extern from *: + """ + void _set_utf8_as_default_src_encoding(); + """ + void _set_utf8_as_default_src_encoding() except * + +def set_utf8_as_default_src_encoding(): + _set_utf8_as_default_src_encoding() diff --git a/gpython/_gpython_c.cpp b/gpython/_gpython_c.cpp new file mode 100644 index 0000000..05ba977 --- /dev/null +++ b/gpython/_gpython_c.cpp @@ -0,0 +1,76 @@ +// Copyright (C) 2023 Nexedi SA and Contributors. +// Kirill Smelkov +// +// This program is free software: you can Use, Study, Modify and Redistribute +// it under the terms of the GNU General Public License version 3, or (at your +// option) any later version, as published by the Free Software Foundation. +// +// You can also Link and Combine this program with other software covered by +// the terms of any of the Free Software licenses or any of the Open Source +// Initiative approved licenses and Convey the resulting work. Corresponding +// source of such a combination shall include the source code for all other +// software used. +// +// This program is distributed WITHOUT ANY WARRANTY; without even the implied +// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +// +// See COPYING file for full licensing terms. +// See https://www.nexedi.com/licensing for rationale and options. 
+ +// XXX doctitle + +#include +#if PY_MAJOR_VERSION < 3 +#include // mod_ty & co +#include // node +#include // encoding_decl & co +#include // PyAST_FromNode & co +#endif + +#include + +// py2: wrap PyAST_FromNode so that "utf-8" becomes the default encoding +#if PY_MAJOR_VERSION < 3 +static auto _py_PyAST_FromNode = &PyAST_FromNode; +static mod_ty gpy_PyAST_FromNode(const node* n, PyCompilerFlags* flags, + const char* filename, PyArena* arena) +{ +// fprintf(stderr, "gpy_PyAST_FromNode...\n"); + PyCompilerFlags gflags = {.cf_flags = 0}; + if (flags) + gflags = *flags; + if (TYPE(n) != encoding_decl) + gflags.cf_flags |= PyCF_SOURCE_IS_UTF8; + return _py_PyAST_FromNode(n, &gflags, filename, arena); +} + +static funchook_t* gpy_PyAST_FromNode_hook; +void _set_utf8_as_default_src_encoding() { + funchook_t *h; + int err; + +// funchook_set_debug_file("/dev/stderr"); + + gpy_PyAST_FromNode_hook = h = funchook_create(); + if (h == NULL) { + PyErr_NoMemory(); + return; + } + + err = funchook_prepare(h, (void**)&_py_PyAST_FromNode, (void*)gpy_PyAST_FromNode); + if (err != 0) { + PyErr_SetString(PyExc_RuntimeError, funchook_error_message(h)); + return; + } + + err = funchook_install(h, 0); + if (err != 0) { + PyErr_SetString(PyExc_RuntimeError, funchook_error_message(h)); + return; + } + + // ok +} +#else +void _set_utf8_as_default_src_encoding() {} +#endif diff --git a/gpython/gpython_test.py b/gpython/gpython_test.py index 420d0d4..355c2e7 100644 --- a/gpython/gpython_test.py +++ b/gpython/gpython_test.py @@ -47,20 +47,34 @@ def runtime(request): yield request.param +# strings is pytest fixture that yields all variants of should be supported gpython strings: +# '' - not specified (gpython should autoselect) +# 'bstr+ustr' +# 'pystd' +@pytest.fixture(scope="function", params=['', 'bstr+ustr', 'pystd']) +def strings(request): + yield request.param + # gpyenv returns environment appropriate for spawning gpython with -# specified runtime. 
-def gpyenv(runtime): # -> env +# specified runtime and strings. +def gpyenv(runtime, strings): # -> env env = os.environ.copy() if runtime != '': env['GPYTHON_RUNTIME'] = runtime else: env.pop('GPYTHON_RUNTIME', None) + if strings != '': + env['GPYTHON_STRINGS'] = strings + else: + env.pop('GPYTHON_STRINGS', None) return env @gpython_only def test_defaultencoding_utf8(): assert sys.getdefaultencoding() == 'utf-8' + assert eval("u'αβγ'") == u'αβγ' # FIXME fails on py2 which uses hardcoded default latin1 + # XXX +exec, +run file @gpython_only def test_golang_builtins(): @@ -143,19 +157,42 @@ def assert_gevent_not_activated(): @gpython_only -def test_executable(runtime): +def test_str_patched(): + # gpython, by default, patches str/unicode to be bstr/ustr. + # handling of various string modes is explicitly tested in test_Xstrings. + assert_str_patched() + +def assert_str_patched(): + #assert str.__name__ == ('bstr' if PY2 else 'ustr') + assert str.__name__ == 'str' + assert str is (bstr if PY2 else ustr) + if PY2: + assert unicode.__name__ == 'unicode' + assert unicode is ustr + assert type('') is str + assert type(b'') is (bstr if PY2 else bytes) + assert type(u'') is ustr + +def assert_str_not_patched(): + assert str.__name__ == 'str' + assert str is not bstr + assert str is not ustr + if PY2: + assert unicode.__name__ == 'unicode' + assert unicode is not bstr + assert unicode is not ustr + assert type('') is str + assert type(b'') is bytes + assert type(u'') is (unicode if PY2 else str) + + +@gpython_only +def test_executable(): # sys.executable must point to gpython and we must be able to execute it. 
- import gevent assert 'gpython' in sys.executable - ver = pyout(['-c', 'import sys; print(sys.version)'], env=gpyenv(runtime)) + ver = pyout(['-c', 'import sys; print(sys.version)'], env=gpyenv('', '')) ver = str(ver) assert ('[GPython %s]' % golang.__version__) in ver - if runtime != 'threads': - assert ('[gevent %s]' % gevent.__version__) in ver - assert ('[threads]') not in ver - else: - assert ('[gevent ') not in ver - assert ('[threads]') in ver # verify pymain. @@ -322,15 +359,20 @@ def check(argv): # pymain -V/--version # gpython_only because output differs from !gpython. @gpython_only -def test_pymain_ver(runtime): +def test_pymain_ver(runtime, strings): from golang import b from gpython import _version_info_str as V import gevent vok = 'GPython %s' % golang.__version__ if runtime != 'threads': - vok += ' [gevent %s]' % gevent.__version__ + vok += ' [runtime gevent %s]' % gevent.__version__ else: - vok += ' [threads]' + vok += ' [runtime threads]' + + if strings != 'pystd': + vok += ' [strings bstr+ustr]' + else: + vok += ' [strings pystd]' if is_cpython: vok += ' / CPython %s' % platform.python_version() @@ -341,10 +383,12 @@ def test_pymain_ver(runtime): vok += '\n' - ret, out, err = _pyrun(['-V'], stdout=PIPE, stderr=PIPE, env=gpyenv(runtime)) + env = gpyenv(runtime, strings) + + ret, out, err = _pyrun(['-V'], stdout=PIPE, stderr=PIPE, env=env) assert (ret, out, b(err)) == (0, b'', b(vok)) - ret, out, err = _pyrun(['--version'], stdout=PIPE, stderr=PIPE, env=gpyenv(runtime)) + ret, out, err = _pyrun(['--version'], stdout=PIPE, stderr=PIPE, env=env) assert (ret, out, b(err)) == (0, b'', b(vok)) # verify that ./bin/gpython runs ok. 
diff --git a/pyproject.toml b/pyproject.toml index c19bed0..07ecad3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,2 +1,2 @@ [build-system] -requires = ["setuptools", "wheel", "setuptools_dso >= 2.7", "cython", "gevent"] +requires = ["setuptools", "wheel", "setuptools_dso >= 2.7", "cython < 3", "gevent"] diff --git a/setup.py b/setup.py index f3ef37f..f0539ee 100644 --- a/setup.py +++ b/setup.py @@ -42,9 +42,9 @@ def pygo_cy_builtin_type_name_get(self): from setuptools.command.develop import develop as _develop from distutils import sysconfig from os.path import dirname, join -import sys, os, re +import sys, os, re, platform, errno -# read file content +# read/write file content def readfile(path): # -> str with open(path, 'rb') as f: data = f.read() @@ -52,6 +52,20 @@ def readfile(path): # -> str data = data.decode('utf-8') return data +def writefile(path, data): + if not isinstance(data, bytes): + data = data.encode('utf-8') + with open(path, 'wb') as f: + f.write(data) + +# mkdir -p +def mkdir_p(path): + try: + os.makedirs(path) + except OSError as e: + if e.errno != errno.EEXIST: + raise + # reuse golang.pyx.build to build pygolang dso and extensions. # we have to be careful and inject synthetic golang package in order to be # able to import golang.pyx.build without built/working golang. @@ -59,6 +73,7 @@ def readfile(path): # -> str exec(readfile('trun'), trun) trun['ximport_empty_golangmod']() from golang.pyx.build import setup, DSO, Extension as Ext +from setuptools_dso import ProbeToolchain # grep searches text for pattern. @@ -104,7 +119,7 @@ class XInstallGPython: # (script_name, script) -> (script_name, script) def transform_script(self, script_name, script): # on windows setuptools installs 3 files: - # gpython-script.py + # gpython-script.py XXX do we need to adjust this similarly to pymain? # gpython.exe # gpython.exe.manifest # we want to override .py only. 
@@ -173,8 +188,8 @@ def install_egg_scripts(self, dist): # requirements of packages under "golang." namespace R = { - 'cmd.pybench': {'pytest'}, - 'pyx.build': {'setuptools', 'wheel', 'cython', 'setuptools_dso >= 2.7'}, + 'cmd.pybench': {'pytest', 'py'}, + 'pyx.build': {'setuptools', 'wheel', 'cython < 3', 'setuptools_dso >= 2.7'}, 'x.perf.benchlib': {'numpy'}, } # TODO generate `a.b -> a`, e.g. x.perf = join(x.perf.*); x = join(x.*) @@ -184,7 +199,8 @@ def install_egg_scripts(self, dist): R['all'] = Rall # ipython/pytest are required to test py2 integration patches -R['all_test'] = Rall.union(['ipython', 'pytest']) # pip does not like "+" in all+test +# zodbpickle is used to test pickle support for bstr/ustr +R['all_test'] = Rall.union(['ipython', 'pytest', 'zodbpickle']) # pip does not like "+" in all+test # extras_require <- R extras_require = {} @@ -200,6 +216,206 @@ def get_python_libdir(): else: return sysconfig.get_config_var('LIBDIR') +# funchook_dso is DSO for libfunchook.so or None if CPU is not supported. 
+def _(): + cpu = platform.machine() + if re.match('x86|i.86|x86_64|amd64', cpu, re.I): + cpu = 'x86' + disasm = 'distorm' + elif re.match('aarch64|arm64', cpu, re.I): + cpu = 'arm64' + disasm = 'capstone' + else: + return None # no funchook support + + # XXX temp test XXX no -> we need capstone for disasm + disasm = 'capstone' + + if platform.system() == 'Windows': + os = 'windows' + libv = ['psapi'] + else: + os = 'unix' + libv = ['dl'] + + FH = '3rdparty/funchook/' + srcv = [FH+'src/funchook.c', + FH+'src/funchook_%s.c' % cpu, + FH+'src/funchook_%s.c' % os, + FH+'src/disasm_%s.c' % disasm] + depv = [FH+'include/funchook.h', + FH+'src/disasm.h', + FH+'src/funchook_arm64.h', + FH+'src/funchook_internal.h', + FH+'src/funchook_x86.h'] + incv = [FH+'include'] + defv = ['FUNCHOOK_EXPORTS'] + + if disasm == 'distorm': + D3 = '3rdparty/funchook/distorm/' + srcv += [D3+'src/decoder.c', + D3+'src/distorm.c', + D3+'src/instructions.c', + D3+'src/insts.c', + D3+'src/mnemonics.c', + D3+'src/operands.c', + D3+'src/prefix.c', + D3+'src/textdefs.c'] + depv += [D3+'include/distorm.h', + D3+'include/mnemonics.h', + D3+'src/config.h', + D3+'src/decoder.h', + D3+'src/instructions.h', + D3+'src/insts.h', + D3+'src/operands.h', + D3+'src/prefix.h', + D3+'src/textdefs.h', + D3+'src/wstring.h', + D3+'src/x86defs.h'] + incv += [D3+'include'] + + if disasm == 'capstone': + CS = '3rdparty/capstone/' + srcv += [CS+'cs.c', + CS+'Mapping.c', + CS+'MCInst.c', + CS+'MCInstrDesc.c', + CS+'MCRegisterInfo.c', + CS+'SStream.c', + CS+'utils.c'] + depv += [CS+'cs_simple_types.h', + CS+'cs_priv.h', + CS+'LEB128.h', + CS+'Mapping.h', + CS+'MathExtras.h', + CS+'MCDisassembler.h', + CS+'MCFixedLenDisassembler.h', + CS+'MCInst.h', + CS+'MCInstrDesc.h', + CS+'MCRegisterInfo.h', + CS+'SStream.h', + CS+'utils.h'] + incv += [CS+'include'] + + depv += [CS+'include/capstone/arm64.h', + CS+'include/capstone/arm.h', + CS+'include/capstone/capstone.h', + CS+'include/capstone/evm.h', + 
CS+'include/capstone/wasm.h', + CS+'include/capstone/mips.h', + CS+'include/capstone/ppc.h', + CS+'include/capstone/x86.h', + CS+'include/capstone/sparc.h', + CS+'include/capstone/systemz.h', + CS+'include/capstone/xcore.h', + CS+'include/capstone/m68k.h', + CS+'include/capstone/tms320c64x.h', + CS+'include/capstone/m680x.h', + CS+'include/capstone/mos65xx.h', + CS+'include/capstone/bpf.h', + CS+'include/capstone/riscv.h', + CS+'include/capstone/sh.h', + CS+'include/capstone/tricore.h', + CS+'include/capstone/platform.h'] + + defv += ['CAPSTONE_SHARED', 'CAPSTONE_USE_SYS_DYN_MEM'] + + if cpu == 'arm64': + defv += ['CAPSTONE_HAS_ARM64'] + srcv += [CS+'arch/AArch64/AArch64BaseInfo.c', + CS+'arch/AArch64/AArch64Disassembler.c', + CS+'arch/AArch64/AArch64InstPrinter.c', + CS+'arch/AArch64/AArch64Mapping.c', + CS+'arch/AArch64/AArch64Module.c'] + depv += [CS+'arch/AArch64/AArch64AddressingModes.h', + CS+'arch/AArch64/AArch64BaseInfo.h', + CS+'arch/AArch64/AArch64Disassembler.h', + CS+'arch/AArch64/AArch64InstPrinter.h', + CS+'arch/AArch64/AArch64Mapping.h', + CS+'arch/AArch64/AArch64GenAsmWriter.inc', + CS+'arch/AArch64/AArch64GenDisassemblerTables.inc', + CS+'arch/AArch64/AArch64GenInstrInfo.inc', + CS+'arch/AArch64/AArch64GenRegisterInfo.inc', + CS+'arch/AArch64/AArch64GenRegisterName.inc', + CS+'arch/AArch64/AArch64GenRegisterV.inc', + CS+'arch/AArch64/AArch64GenSubtargetInfo.inc', + CS+'arch/AArch64/AArch64GenSystemOperands.inc', + CS+'arch/AArch64/AArch64GenSystemOperands_enum.inc', + CS+'arch/AArch64/AArch64MappingInsn.inc', + CS+'arch/AArch64/AArch64MappingInsnName.inc', + CS+'arch/AArch64/AArch64MappingInsnOp.inc'] + + if cpu == 'x86': + defv += ['CAPSTONE_HAS_X86'] + srcv += [CS+'arch/X86/X86ATTInstPrinter.c', # !diet + CS+'arch/X86/X86Disassembler.c', + CS+'arch/X86/X86DisassemblerDecoder.c', + CS+'arch/X86/X86IntelInstPrinter.c', + CS+'arch/X86/X86InstPrinterCommon.c', + CS+'arch/X86/X86Mapping.c', + CS+'arch/X86/X86Module.c'] + depv += 
[CS+'arch/X86/X86BaseInfo.h', + CS+'arch/X86/X86Disassembler.h', + CS+'arch/X86/X86DisassemblerDecoder.h', + CS+'arch/X86/X86DisassemblerDecoderCommon.h', + CS+'arch/X86/X86GenAsmWriter.inc', + CS+'arch/X86/X86GenAsmWriter1.inc', + CS+'arch/X86/X86GenAsmWriter1_reduce.inc', + CS+'arch/X86/X86GenAsmWriter_reduce.inc', + CS+'arch/X86/X86GenDisassemblerTables.inc', + CS+'arch/X86/X86GenDisassemblerTables_reduce.inc', + CS+'arch/X86/X86GenInstrInfo.inc', + CS+'arch/X86/X86GenInstrInfo_reduce.inc', + CS+'arch/X86/X86GenRegisterInfo.inc', + CS+'arch/X86/X86InstPrinter.h', + CS+'arch/X86/X86Mapping.h', + CS+'arch/X86/X86MappingInsn.inc', + CS+'arch/X86/X86MappingInsnOp.inc', + CS+'arch/X86/X86MappingInsnOp_reduce.inc', + CS+'arch/X86/X86MappingInsn_reduce.inc'] + + # config.h + probe = ProbeToolchain() + config_h = [] + def cfgemit(line): + config_h.append(line+'\n') + def defif(name, ok): + if ok: + cfgemit('#define %s 1' % name) + else: + cfgemit('#undef %s' % name) + + for d in ('capstone', 'distorm', 'zydis'): + defif('DISASM_%s' % d.upper(), d == disasm) + + cfgemit('#define SIZEOF_VOID_P %d' % probe.sizeof('void*')) + + defif('_GNU_SOURCE', 1) + defif('GNU_SPECIFIC_STRERROR_R', probe.try_compile(""" +#define _GNU_SOURCE 1 +#include +int main() +{ + char dummy[128]; + return *strerror_r(0, dummy, sizeof(dummy)); +} +""")) + + fbuild_src = 'build/3rdparty/funchook/src' + mkdir_p(fbuild_src) + writefile(fbuild_src+'/config.h', ''.join(config_h)) + incv += [fbuild_src] + + return DSO('golang.runtime.funchook', srcv, + depends = depv, + language = 'c', + include_dirs = incv, + define_macros = [(_, None) for _ in defv], + libraries = libv, + soversion = '1.1') +funchook_dso = _() + + setup( name = 'pygolang', version = version, @@ -225,6 +441,7 @@ def get_python_libdir(): ['golang/runtime/libgolang.cpp', 'golang/runtime/internal/atomic.cpp', 'golang/runtime/internal/syscall.cpp', + 'golang/runtime.cpp', 'golang/context.cpp', 'golang/errors.cpp', 'golang/fmt.cpp', @@ 
-236,9 +453,11 @@ def get_python_libdir(): 'golang/time.cpp'], depends = [ 'golang/libgolang.h', + 'golang/runtime.h', 'golang/runtime/internal.h', 'golang/runtime/internal/atomic.h', 'golang/runtime/internal/syscall.h', + 'golang/runtime/platform.h', 'golang/context.h', 'golang/cxx.h', 'golang/errors.h', @@ -259,12 +478,21 @@ def get_python_libdir(): include_dirs = [sysconfig.get_python_inc()], library_dirs = [get_python_libdir()], define_macros = [('BUILDING_LIBPYXRUNTIME', None)], - soversion = '0.1')], + soversion = '0.1')] + + ([funchook_dso] if funchook_dso else []), ext_modules = [ Ext('golang._golang', - ['golang/_golang.pyx'], - depends = ['golang/_golang_str.pyx']), + ['golang/_golang.pyx', + 'golang/_golang_str_pickle.S'], + depends = [ + 'golang/_golang_str.pyx', + 'golang/_golang_str_pickle.pyx', + 'golang/_golang_str_pickle_test.pyx', + 'golang/_golang_str_pickle.S'], + dsos = ['golang.runtime.funchook'], # XXX only if available + include_dirs = ['3rdparty/funchook/include', + '3rdparty/capstone/include']), Ext('golang.runtime._runtime_thread', ['golang/runtime/_runtime_thread.pyx']), @@ -334,6 +562,14 @@ def get_python_libdir(): Ext('golang._time', ['golang/_time.pyx'], dsos = ['golang.runtime.libpyxruntime']), + + # XXX consider putting everything into just gpython.pyx + .c + Ext('gpython._gpython', + ['gpython/_gpython.pyx', + 'gpython/_gpython_c.cpp'], # XXX do we need C++ here? 
+ include_dirs = ['3rdparty/funchook/include'], + dsos = ['golang.runtime.funchook'], # XXX only if available + ), ], include_package_data = True, From e035c704d6fc44abf2c2c6853bfb5609205725a9 Mon Sep 17 00:00:00 2001 From: Kirill Smelkov Date: Tue, 30 Jan 2024 15:07:53 +0300 Subject: [PATCH 20/29] X Bring y/bstr+x/gpystr to be at least usable Asked by Kazuhiko: https://lab.nexedi.com/nexedi/pygolang/-/merge_requests/21#note_198526 --- golang/_golang_str.pyx | 2 +- gpython/__init__.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/golang/_golang_str.pyx b/golang/_golang_str.pyx index d7282a0..78bbd40 100644 --- a/golang/_golang_str.pyx +++ b/golang/_golang_str.pyx @@ -2402,7 +2402,7 @@ cdef _pytype_replace_by_child(PyTypeObject *typ, PyTypeObject *typ_clone, assert isinstance(x, type) xtyp = x _xtyp = <_XPyTypeObject*>x - fprintf(stderr, 'refreshing %s\n', xtyp.tp_name) + #fprintf(stderr, 'refreshing %s\n', xtyp.tp_name) assert (xtyp.tp_flags & Py_TPFLAGS_READY) != 0 xtyp.tp_flags &= ~Py_TPFLAGS_READY Py_CLEAR(_xtyp.tp_mro) diff --git a/gpython/__init__.py b/gpython/__init__.py index 14b2fce..f2225f8 100755 --- a/gpython/__init__.py +++ b/gpython/__init__.py @@ -474,7 +474,6 @@ def init(): from six.moves import builtins for k in golang.__all__: setattr(builtins, k, getattr(golang, k)) - setattr(builtins, 'CCC', CCC) # sys.version sys.version += (' [GPython %s] [runtime %s] [strings %s]' % (golang.__version__, gpy_runtime_ver, gpy_strings)) @@ -596,7 +595,7 @@ def __next__(self): # for tests XXX continue by first writing test XXX -1/0 +#1/0 class _tEarlyStrSubclass(str): pass From ac87a2ed0ca039ceeda98dcaf7445cdee03f3cd0 Mon Sep 17 00:00:00 2001 From: Kirill Smelkov Date: Wed, 24 Apr 2024 19:38:42 +0300 Subject: [PATCH 21/29] X Update on my draft state of x/gpystr work Please see demo/pickle_py2_gpy3_demo.py and demo/ZODB_py2_gpy3_demo.py for details of how pickle compatibility problem is solved in between py2 and py3. 
--- demo/.gitignore | 2 + demo/ZODB_py2_gpy3_demo.py | 55 +++++ demo/pickle_py2_gpy3_demo.py | 68 ++++++ golang/__init__.py | 6 +- golang/_golang.pyx | 2 +- golang/_golang_str.pyx | 209 +++++++++++++++++-- golang/_golang_str_pickle.S | 8 +- golang/_golang_str_pickle.pyx | 311 ++++++++++++++++++++++++---- golang/_golang_str_pickle_test.pyx | 20 +- golang/_gopath.py | 40 +++- golang/_strconv.pyx | 2 +- golang/fmt.h | 2 +- golang/golang_str_pickle_test.py | 286 ++++++++++++++++++------- golang/golang_str_test.py | 47 ++++- golang/golang_test.py | 6 + golang/libgolang.h | 2 +- golang/os.cpp | 2 +- golang/os.h | 2 +- golang/os/signal.cpp | 2 +- golang/pyx/build.py | 2 +- golang/runtime.cpp | 4 +- golang/runtime.h | 4 +- golang/runtime/internal/atomic.cpp | 2 +- golang/runtime/internal/syscall.cpp | 2 +- golang/runtime/internal/syscall.h | 2 +- golang/runtime/libgolang.cpp | 2 +- golang/runtime/platform.h | 4 +- gpython/__init__.py | 12 +- gpython/_gpython.pyx | 4 +- gpython/_gpython_c.cpp | 4 +- gpython/gpython_test.py | 2 +- pyproject.toml | 2 +- setup.py | 7 +- tox.ini | 12 +- trun | 22 +- 35 files changed, 952 insertions(+), 207 deletions(-) create mode 100644 demo/.gitignore create mode 100755 demo/ZODB_py2_gpy3_demo.py create mode 100755 demo/pickle_py2_gpy3_demo.py diff --git a/demo/.gitignore b/demo/.gitignore new file mode 100644 index 0000000..a7a5eac --- /dev/null +++ b/demo/.gitignore @@ -0,0 +1,2 @@ +x.pkl +data.fs* diff --git a/demo/ZODB_py2_gpy3_demo.py b/demo/ZODB_py2_gpy3_demo.py new file mode 100755 index 0000000..45a2405 --- /dev/null +++ b/demo/ZODB_py2_gpy3_demo.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Program ZODB_py2_gpy3_demo demonstrates interoperability in between py2 and py3 +# regarding pickled strings in ZODB. +# +# It is similar to pickle_py2_gpy3_demo, but persists data inside ZODB instead +# of raw pickle file. +# +# Please see pickle_py2_gpy3_demo for details. 
+ +from __future__ import print_function + +from persistent import Persistent +from ZODB.FileStorage import FileStorage +from ZODB.DB import DB +import transaction + +from zodbpickle import fastpickle as pickle +import pickletools +import sys + + +class MyClass(Persistent): + __slots__ = ('data',) + +def main(): + print(sys.version) + + # adjust FileStorage magic so that py3 does not refuse to load FileStorage produced on py2 + fsmod = __import__('ZODB.FileStorage.FileStorage', fromlist=['ZODB']) + assert hasattr(fsmod, 'packed_version') + fsmod.packed_version = b'FS21' + + stor = FileStorage('data.fs') + db = DB(stor) + conn = db.open() + root = conn.root + + if not hasattr(root, 'obj'): + root.obj = obj = MyClass() + obj.data = u'αβγ'.encode('utf-8') + else: + print('\nloading data:') + obj = root.obj + print('\n-> %r\t(%s)' % (obj.data, obj.data)) + + obj.data += b' %d' % len(obj.data) + + print('\nsaving data: %r\t(%s)' % (obj.data, obj.data)) + transaction.commit() + + +if __name__ == '__main__': + main() diff --git a/demo/pickle_py2_gpy3_demo.py b/demo/pickle_py2_gpy3_demo.py new file mode 100755 index 0000000..d107675 --- /dev/null +++ b/demo/pickle_py2_gpy3_demo.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Program pickle_py2_gpy3_demo demonstrates interoperability in between py2 and py3 +# regarding pickled strings. +# +# It initially saves non-ASCII string in pickled form into a file, and on +# further runs tries to load saved object back, appends some tail data to it, +# and saves the result again. +# +# When run on plain py2 everything works as expected: string is initially +# persisted ok, then loaded ok as the same str object, which can be worked with +# as expected, and persisted again ok. +# +# When plain py3 runs this program on the file prepared by py2, loading pickle +# data breaks because, by default, py3 wants to decode *STRING opcodes as ASCII +# and the saved string is not ASCII. 
+# +# However when run under gpy3, the string is loaded ok as bstr. Since bstr has the +# same semantic as regular str on py2, working with that object produces the +# same result plain py2 would produce when adjusting the data. And then, bstr +# is also persisted ok and via the same *STRING opcodes, that py2 originally +# used for the data. +# +# This way both py2 and gpy3 can interoperate on the same database: py2 can +# produce data, gpy3 can read the data and modify it, and further py2 can load +# updated data, again, just ok. + +from __future__ import print_function + +from zodbpickle import fastpickle as pickle +import pickletools +from os.path import exists +import sys + +def main(): + stor = 'x.pkl' + + print(sys.version) + + if not exists(stor): + obj = u'αβγ'.encode('utf-8') + else: + pkl = readfile(stor) + print('\nloading pickle:') + pickletools.dis(pkl) + obj = pickle.loads(pkl) + print('\n-> %r\t(%s)' % (obj, obj)) + + obj += b' %d' % len(obj) + + print('\nsaving obj: %r\t(%s)' % (obj, obj)) + pkl = pickle.dumps(obj) + pickletools.dis(pkl) + writefile(stor, pkl) + + +def readfile(path): + with open(path, 'rb') as f: + return f.read() + +def writefile(path, data): + with open(path, 'wb') as f: + f.write(data) + + +if __name__ == '__main__': + main() diff --git a/golang/__init__.py b/golang/__init__.py index e773775..00babf6 100644 --- a/golang/__init__.py +++ b/golang/__init__.py @@ -38,13 +38,13 @@ __all__ = ['go', 'chan', 'select', 'default', 'nilchan', 'defer', 'panic', 'recover', 'func', 'error', 'b', 'u', 'bstr', 'ustr', 'bbyte', 'uchr', 'gimport'] +import setuptools_dso +setuptools_dso.dylink_prepare_dso('golang.runtime.libgolang') + from golang._gopath import gimport # make gimport available from golang import inspect, sys import decorator, six -import setuptools_dso -setuptools_dso.dylink_prepare_dso('golang.runtime.libgolang') - from golang._golang import _pysys_exc_clear as _sys_exc_clear # @func is a necessary decorator for functions for 
selected golang features to work. diff --git a/golang/_golang.pyx b/golang/_golang.pyx index b857197..689d6a1 100644 --- a/golang/_golang.pyx +++ b/golang/_golang.pyx @@ -5,7 +5,7 @@ # distutils: language = c++ # distutils: depends = libgolang.h os/signal.h unicode/utf8.h _golang_str.pyx _golang_str_pickle.pyx # -# Copyright (C) 2018-2023 Nexedi SA and Contributors. +# Copyright (C) 2018-2024 Nexedi SA and Contributors. # Kirill Smelkov # # This program is free software: you can Use, Study, Modify and Redistribute diff --git a/golang/_golang_str.pyx b/golang/_golang_str.pyx index 78bbd40..3e4a64f 100644 --- a/golang/_golang_str.pyx +++ b/golang/_golang_str.pyx @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright (C) 2018-2023 Nexedi SA and Contributors. +# Copyright (C) 2018-2024 Nexedi SA and Contributors. # Kirill Smelkov # # This program is free software: you can Use, Study, Modify and Redistribute @@ -34,7 +34,9 @@ from cpython.iterobject cimport PySeqIter_New from cpython cimport PyThreadState_GetDict, PyDict_SetItem from cpython cimport PyObject_CheckBuffer from cpython cimport Py_TPFLAGS_HAVE_GC, Py_TPFLAGS_HEAPTYPE, Py_TPFLAGS_READY, PyType_Ready +from cpython cimport Py_TPFLAGS_VALID_VERSION_TAG from cpython cimport PyBytes_Format, PyUnicode_Format, PyObject_Str +from cpython cimport PyObject_GetAttr, PyObject_SetAttr cdef extern from "Python.h": PyTypeObject PyBytes_Type @@ -408,7 +410,6 @@ cdef class _pybstr(bytes): # https://github.com/cython/cython/issues/711 else: return pyb(x) - # XXX temp disabled # __iter__ - yields unicode characters def __iter__(self): # TODO iterate without converting self to u @@ -1145,7 +1146,7 @@ cdef _bstringify(object obj): # -> unicode|bytes _bstringify_enter() try: - if False: # PY_MAJOR_VERSION >= 3: + if False: # PY_MAJOR_VERSION >= 3: # XXX restore ? 
# NOTE this depends on patches to bytes.{__repr__,__str__} below return unicode(obj) @@ -1251,7 +1252,7 @@ def _(): cdef PyTypeObject* t # NOTE patching bytes and its already-created subclasses that did not override .tp_repr/.tp_str # NOTE if we don't also patch __dict__ - e.g. x.__repr__() won't go through patched .tp_repr - for pyt in [bytes] + bytes.__subclasses__(): + for pyt in [bytes] + bytes.__subclasses__(): # FIXME also handle sub-sub-classes assert isinstance(pyt, type) t = pyt if t.tp_repr == _bytes_tp_repr: @@ -1264,7 +1265,7 @@ _() if PY_MAJOR_VERSION < 3: def _(): - cdef PyTypeObject* t + cdef PyTypeObject* t # FIXME also handle sub-sub-classes for pyt in [unicode] + unicode.__subclasses__(): assert isinstance(pyt, type) t = pyt @@ -1301,7 +1302,7 @@ cdef object _unicode_x__ge__(object a, object b): return _unicode_tp_richcompa if PY_MAJOR_VERSION < 3: def _(): cdef PyTypeObject* t - for pyt in [unicode] + unicode.__subclasses__(): + for pyt in [unicode] + unicode.__subclasses__(): # XXX sub-sub-classes assert isinstance(pyt, type) t = pyt if t.tp_richcompare == _unicode_tp_richcompare: @@ -1385,7 +1386,7 @@ def _bytearray_x__iadd__(a, b): return _bytearray_sq_xiconcat(a, b) def _(): cdef PyTypeObject* t - for pyt in [bytearray] + bytearray.__subclasses__(): + for pyt in [bytearray] + bytearray.__subclasses__(): # XXX sub-sub-classes assert isinstance(pyt, type) t = pyt if t.tp_repr == _bytearray_tp_repr: @@ -1408,7 +1409,7 @@ def _(): _() -# _bytearray_data return raw data in bytearray as bytes. +# _bytearray_data returns raw data in bytearray as bytes. # XXX `bytearray s` leads to `TypeError: Expected bytearray, got hbytearray` cdef bytes _bytearray_data(object s): if PY_MAJOR_VERSION >= 3: @@ -1849,6 +1850,7 @@ class _BFormatter(pystring.Formatter): # XXX place, comments # str % ... : ceval on py2 and py3 < 3.11 invokes PyString_Format / PyUnicode_Format # directly upon seeing BINARY_MODULO. This leads to bstr.__mod__ not being called. 
+# XXX -> patch PyString_Format / PyUnicode_Format to invoke our .__mod__ ... ctypedef unicode uformatfunc(object, object) ctypedef bytes bformatfunc(object, object) cdef uformatfunc* _punicode_Format = PyUnicode_Format @@ -1867,7 +1869,7 @@ cdef _patch_capi_str_format(): # XXX place, comments, test -#py3.11: specializes instructions. e.g. ustr(obj) will specialize (after +# py3.11: specializes instructions. e.g. ustr(obj) will specialize (after # executing 8 times) to directly invoke # # PyObject_Str(obj) @@ -1889,6 +1891,37 @@ cdef _patch_capi_object_str(): cpatch(&_pobject_Str, _object_xStr) +# XXX place, comments, test +# on py3 PyObject_GetAttr & co insist on name to be unicode +# XXX _PyObject_LookupAttr +# XXX _PyObject_GenericGetAttrWithDict +# XXX _PyObject_GenericSetAttrWithDict +# XXX type_getattro +IF PY3: + ctypedef object obj_getattr_func(object, object) + ctypedef int obj_setattr_func(object, object, object) except -1 + + cdef obj_getattr_func* _pobject_GetAttr = PyObject_GetAttr + cdef obj_setattr_func* _pobject_SetAttr = PyObject_SetAttr + + cdef object _object_xGetAttr(object obj, object name): +# fprintf(stderr, "xgetattr...\n") + if isinstance(name, pybstr): + name = pyustr(name) + return _pobject_GetAttr(obj, name) + + cdef int _object_xSetAttr(object obj, object name, object v) except -1: +# fprintf(stderr, "xsetattr...\n") + if isinstance(name, pybstr): + name = pyustr(name) + return _pobject_SetAttr(obj, name, v) + + +cdef _patch_capi_object_attr_bstr(): + IF PY3: + cpatch(&_pobject_GetAttr, _object_xGetAttr) + cpatch(&_pobject_SetAttr, _object_xSetAttr) + # ---- misc ---- @@ -2213,6 +2246,7 @@ cdef _patch_str(): upreserve_slots) pyustr = unicode # retarget pyustr -> unicode to where it was copied # XXX vvv needed so that patched unicode could be saved by py2:cPickle at all + # XXX vvv should be done by pytype_replace... ? just us original unicode.tp_name ? 
(pyustr).tp_name = ("unicode" if PY_MAJOR_VERSION < 3 else "str") # py2: patch str to be pybstr @@ -2248,6 +2282,7 @@ cdef _patch_str(): _patch_capi_str_format() _patch_capi_object_str() + _patch_capi_object_attr_bstr() _patch_capi_unicode_decode_as_bstr() _patch_str_pickle() # ... @@ -2259,16 +2294,16 @@ cdef _patch_str(): include '_golang_str_pickle.pyx' # _pytype_clone clones PyTypeObject src into dst. -# dst must not be previously initialized. # -# dst will have reference-count = 1 meaning new reference to it is returned. +# src must be not heap-allocated type. +# dst must be statically allocated and not previously initialized. +# +# dst will have reference-count = 1 meaning new reference to the clone is returned. cdef _pytype_clone(PyTypeObject *src, PyTypeObject *dst, const char* new_name): assert (src.tp_flags & Py_TPFLAGS_READY) != 0 assert (src.tp_flags & Py_TPFLAGS_HEAPTYPE) == 0 # src is not allocated on heap - #assert not PyType_IS_GC((src).ob_type) # XXX not true as unicode.ob_type is PyType_Type - # which generally has GC support, but - # GC is deactivated for non-heap types. - # copy the struct XXX + .ob_next / .ob_prev (Py_TRACE_REFS) + # and so GC for it is disabled + # copy the struct XXX + ._ob_next / ._ob_prev (Py_TRACE_REFS) (set to NULL) dst[0] = src[0] (dst).ob_refcnt = 1 @@ -2277,6 +2312,7 @@ cdef _pytype_clone(PyTypeObject *src, PyTypeObject *dst, const char* new_name): # now reinitialize things like .tp_dict etc, where PyType_Ready built slots that point to src. # we want all those slots to be rebuilt and point to dst instead. + # XXX test _dst = <_XPyTypeObject*>dst dst .tp_flags &= ~Py_TPFLAGS_READY dst .tp_dict = NULL @@ -2286,10 +2322,17 @@ cdef _pytype_clone(PyTypeObject *src, PyTypeObject *dst, const char* new_name): _dst.tp_weaklist = NULL # dst.__subclasses__ will be empty because existing children inherit from src, not from dst. 
+ # XXX but ustr, after copy to unicode, will inherit from unicode(pystd) -- recheck + # XXX test _dst.tp_subclasses = NULL + # XXX -> common reinherit fixup + if _dst.tp_init == (<_XPyTypeObject*>(dst.tp_base)).tp_init: + _dst.tp_init = NULL + PyType_Ready(dst) assert (dst.tp_flags & Py_TPFLAGS_READY) != 0 + assert (dst.tp_flags & Py_TPFLAGS_HEAPTYPE) == 0 # _pytype_replace_by_child replaces typ by its child egg. # @@ -2305,8 +2348,10 @@ cdef _pytype_clone(PyTypeObject *src, PyTypeObject *dst, const char* new_name): # ↑ ↑ # Y Y # +# typ and egg must be static non heap-allocated types. +# # typ_clone must be initialized via _pytype_clone(typ, typ_clone). -# egg' is egg clone put inplace of typ +# egg' is egg clone put inplace of typ. # # XXX preserve_slots - describe cdef _pytype_replace_by_child(PyTypeObject *typ, PyTypeObject *typ_clone, @@ -2323,15 +2368,11 @@ cdef _pytype_replace_by_child(PyTypeObject *typ, PyTypeObject *typ_clone, assert (egg.tp_flags & Py_TPFLAGS_READY) != 0 assert (typ.tp_flags & Py_TPFLAGS_HEAPTYPE) == 0 - assert (egg.tp_flags & Py_TPFLAGS_HEAPTYPE) == 0 # XXX will be not true - # -> ! Py_TPFLAGS_HAVE_GC - # -> ? set Py_TPFLAGS_HEAPTYPE back on typ' ? + assert (egg.tp_flags & Py_TPFLAGS_HEAPTYPE) == 0 # (generally not required) assert (typ.tp_flags & Py_TPFLAGS_HAVE_GC) == 0 assert (egg.tp_flags & Py_TPFLAGS_HAVE_GC) == 0 - # XXX also check PyObject_IS_GC (verifies .tp_is_gc() = n) ? - assert vtyp.ob_size == vegg.ob_size assert typ .tp_basicsize == egg .tp_basicsize @@ -2353,11 +2394,14 @@ cdef _pytype_replace_by_child(PyTypeObject *typ, PyTypeObject *typ_clone, Py_CLEAR(_egg.tp_bases) Py_CLEAR(_egg.tp_mro) Py_CLEAR(_egg.tp_cache) + # XXX 3.12 +tp_watched # typ <- egg preserving original typ's refcnt, weak references and subclasses\egg. # typ will be now playing the role of egg typ_refcnt = otyp.ob_refcnt + # XXX py3.12 "For the static builtin types this is always NULL, even if weakrefs are added ..." 
typ_weaklist = _typ.tp_weaklist + # XXX py3.12 "May be an invalid pointer" (for static builtin types it became `size_t index` typ_subclasses = _typ.tp_subclasses typ[0] = egg[0] otyp.ob_refcnt = typ_refcnt @@ -2376,6 +2420,63 @@ cdef _pytype_replace_by_child(PyTypeObject *typ, PyTypeObject *typ_clone, # live in .tp_dict and point to their type. Do it for both typ (new egg) # and origin egg for generality, even though original egg won't be used # anymore. + # + # XXX also check which pointers/other things are propagated from base to + # subclasses. It is e.g. tp_new but others might be as well. + # + # https://docs.python.org/3/c-api/typeobj.html -> inheritance + defaults: + # + # D(default): + # tp_base X + # tp_dict ? + # tp_alloc ? + # tp_new ? + # tp_free ? + # + # ~ + # ~ + # + # I(inherited): + # ob_type == &PyType_Type + # + tp_basicsize == + # + tp_itemsize == + # tp_dealloc + # + tp_vectorcall_offset == + # tp_getattr / tp_getattro + # tp_setattr / tp_setattro NULL + # tp_as_async NULL + # tp_repr + # tp_as_number for % + # tp_as_sequence len concat repeat sq_item contains ... 
+ # tp_as_mapping len subscript + # tp_hash / tp_richcompare + # tp_call NULL + # tp_str + # tp_as_buffer NULL(unicode) !NULL(ustr) + # tp_flags XXX recheck how flags are rebuild by PyTypes_Ready + # tp_traverse / tp_clear NULL <- Py_TPFLAGS_HAVE_GC + # tp_clear NULL + # + tp_weaklistoffset + # tp_iter + # tp_iternext NULL + # tp_descr_get NULL + # tp_descr_set NULL + # + tp_dictoffset 0 + # tp_init NULL + # tp_alloc == (PyType_GenericAlloc) + # tp_new + # tp_free XXX recheck + # tp_is_gc NULL + # tp_finalize NULL + # + # XXX also check PyHeapTypeObject + + # don't let PyType_Ready to create __init__ if tp_init is inherited + if _typ.tp_init == (<_XPyTypeObject*>(typ.tp_base)).tp_init: + _typ.tp_init = NULL + if _egg.tp_init == (<_XPyTypeObject*>(egg.tp_base)).tp_init: + _egg.tp_init = NULL + typ.tp_flags &= ~Py_TPFLAGS_READY egg.tp_flags &= ~Py_TPFLAGS_READY PyType_Ready(typ) @@ -2398,11 +2499,72 @@ cdef _pytype_replace_by_child(PyTypeObject *typ, PyTypeObject *typ_clone, # initially X.__mro__ = (X, typ, base) and without rebuilding it would # remain (X, egg', base) instead of correct (X, egg' typ_clone, base) # XXX py3 does this automatically? XXX -> no, it can invalidate .__mro__, but not .tp_mro + + # refresh fields related to X inheriting from its base. + # currents state of base is Bnew. + # old state of base is represented by Bold. 
+ # NOTE for first-level children of typ Bnew=egg' and Bold=typ_clone + # for further levels Bnew=bold + def inherit_refresh(X, Bold, Bnew): + # depth-first + for Y in X.__subclasses__(): + inherit_refresh(Y, X, X) + assert isinstance(Bold, type) + assert isinstance(Bnew, type) + assert isinstance(X, type) + o = Bold ; _o = <_XPyTypeObject*>Bold + b = Bnew ; _b = <_XPyTypeObject*>Bnew + x = X ; _x = <_XPyTypeObject*>X +# fprintf(stderr, 'refresh %s\t<- %s', x.tp_name, b.tp_name) +# if Bold is not Bnew: +# fprintf(stderr, '\t# was <- %s', o.tp_name) +# fprintf(stderr, '\n') + assert (x.tp_flags & Py_TPFLAGS_READY) != 0 + x.tp_flags &= ~Py_TPFLAGS_READY + + xdict = (x.tp_dict) + def clear(slotname): + del xdict[slotname] +# Py_CLEAR(_x.tp_dict) # XXX preserve some ? +# Py_CLEAR(_x.tp_bases) # to be rebuilt XXX not ok to clear wrt multi-inheritance XXX test + Py_CLEAR(_x.tp_mro) # ----//---- + Py_CLEAR(_x.tp_cache) # ----//---- + + if _x.tp_new == _o.tp_new: + _x.tp_new = NULL # reinherit from Bnew on reready + # del xdict['__new__'] XXX raises KeyError - why? + if _x.tp_init == _o.tp_init: # XXX also check other bases from mro (ex. StrEnum(str,Enum) which has Enum.__init__) +# fprintf(stderr, ' tp_init <- NULL\n') + _x.tp_init = NULL + #clear('__init__') XXX + + def inherit_reready(X): + assert isinstance(X, type) + x = X +# fprintf(stderr, 'ready %s\n', x.tp_name) + assert (x.tp_flags & Py_TPFLAGS_READY) == 0 + PyType_Ready(X) + assert (x.tp_flags & Py_TPFLAGS_READY) != 0 + + # top-down + for Y in X.__subclasses__(): + inherit_reready(Y) + + assert (x.tp_flags & Py_TPFLAGS_VALID_VERSION_TAG) != 0 + + for X in (typ).__subclasses__(): + inherit_refresh(X, typ_clone, typ) + for X in (typ).__subclasses__(): + inherit_reready(X) + + PyType_Modified(typ) # XXX needed ? 
+ + """ def refresh(x): assert isinstance(x, type) xtyp = x _xtyp = <_XPyTypeObject*>x - #fprintf(stderr, 'refreshing %s\n', xtyp.tp_name) + fprintf(stderr, 'refreshing %s\n', xtyp.tp_name) assert (xtyp.tp_flags & Py_TPFLAGS_READY) != 0 xtyp.tp_flags &= ~Py_TPFLAGS_READY Py_CLEAR(_xtyp.tp_mro) @@ -2410,7 +2572,8 @@ cdef _pytype_replace_by_child(PyTypeObject *typ, PyTypeObject *typ_clone, assert (xtyp.tp_flags & Py_TPFLAGS_READY) != 0 for _ in x.__subclasses__(): refresh(_) - for _ in (typ).__subclasses__(): + for _ in (typ).__subclasses__(): # XXX + sub-sub-classes refresh(_) + """ # XXX also preserve ._ob_next + ._ob_prev (present in Py_TRACE_REFS builds) diff --git a/golang/_golang_str_pickle.S b/golang/_golang_str_pickle.S index 3b954bc..c889069 100644 --- a/golang/_golang_str_pickle.S +++ b/golang/_golang_str_pickle.S @@ -1,5 +1,5 @@ -// Copyright (C) 2023 Nexedi SA and Contributors. -// Kirill Smelkov +// Copyright (C) 2023-2024 Nexedi SA and Contributors. +// Kirill Smelkov // // This program is free software: you can Use, Study, Modify and Redistribute // it under the terms of the GNU General Public License version 3, or (at your @@ -288,7 +288,7 @@ inside_counted_stk: // disable executable stack -#ifndef LIBGOLANG_OS_windows +#ifdef LIBGOLANG_OS_linux .section .note.GNU-stack,"",@progbits #endif @@ -304,7 +304,7 @@ inside_counted_stk: #if defined(LIBGOLANG_ARCH_386) -#ifdef LIBGOLANG_CC_msc +#ifdef LIBGOLANG_OS_windows // both msvc and clang-cl # define CSYM_FASTCALL3(name) @name@12 // MSVC mangles __fastcall # define CSYM_FASTCALL4(name) @name@16 #else diff --git a/golang/_golang_str_pickle.pyx b/golang/_golang_str_pickle.pyx index ec091c2..e6918fc 100644 --- a/golang/_golang_str_pickle.pyx +++ b/golang/_golang_str_pickle.pyx @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright (C) 2023 Nexedi SA and Contributors. -# Kirill Smelkov +# Copyright (C) 2023-2024 Nexedi SA and Contributors. 
+# Kirill Smelkov # # This program is free software: you can Use, Study, Modify and Redistribute # it under the terms of the GNU General Public License version 3, or (at your @@ -27,6 +27,7 @@ The main entry-points are _patch_str_pickle and _patch_capi_unicode_decode_as_bs from cpython cimport PyUnicode_Decode from cpython cimport PyBytes_FromStringAndSize, _PyBytes_Resize +from cpython cimport PyObject_CallObject, PyObject_CallFunctionObjArgs cdef extern from "Python.h": char* PyBytes_AS_STRING(PyObject*) @@ -130,6 +131,8 @@ cdef struct PicklerTypeInfo: Py_ssize_t off_poutput_buffer # offsetof `PyObject *output_buffer` Py_ssize_t off_output_len # offsetof `Py_ssize_t output_len` Py_ssize_t off_max_output_len # offsetof `Py_ssize_t max_output_len` + Py_ssize_t off_pers_func # offsetof `PyObject *pers_func` + Py_ssize_t off_pers_func_self # offsetof `PyObject *pers_func_self` or -1 if this field is not there # XXX place ? @@ -147,36 +150,61 @@ cdef extern from * nogil: // FOR_EACH_CALLCONV invokes macro X(ccname, callconv, cckind) for every supported calling convention. - // cckind is one of `builtin` or `custom`. + // cckind is one of `builtin`, `custom` or `builtin_psave0`. + // + // - `builtin` represents native calling conventions of the compiler + // available to the programmer via function attributes. + // - `custom` represents custom calling convention for which there is no + // public attribute and via-assembly proxy needs to be used to call such function. + // - `builtin_psave0` represents native calling convention, but indicates + // that the third argument of `save` was const-propagated with `pers_save=0`. + // + // NOTE: psave0 variants go last so that !constprop versions have higher priority to be probed. 
#ifdef LIBGOLANG_ARCH_386 # ifndef LIBGOLANG_CC_msc # define FOR_EACH_CALLCONV(X) \ - X(default,, builtin) \ - X(cdecl, CALLCONV(cdecl), builtin) \ - X(stdcall, CALLCONV(stdcall), builtin) \ - X(fastcall, CALLCONV(fastcall), builtin) \ - X(thiscall, CALLCONV(thiscall), builtin) \ - X(regparm1, CALLCONV(regparm(1)), builtin) \ - X(regparm2, CALLCONV(regparm(2)), builtin) \ - X(regparm3, CALLCONV(regparm(3)), builtin) \ - X(fastcall_nostkclean, na, custom ) + X(default,, builtin) \ + X(cdecl, CALLCONV(cdecl), builtin) \ + X(stdcall, CALLCONV(stdcall), builtin) \ + X(fastcall, CALLCONV(fastcall), builtin) \ + X(thiscall, CALLCONV(thiscall), builtin) \ + X(regparm1, CALLCONV(regparm(1)), builtin) \ + X(regparm2, CALLCONV(regparm(2)), builtin) \ + X(regparm3, CALLCONV(regparm(3)), builtin) \ + X(fastcall_nostkclean, na, custom ) \ + X(default_psave0,, builtin_psave0) \ + X(cdecl_psave0, CALLCONV(cdecl), builtin_psave0) \ + X(stdcall_psave0, CALLCONV(stdcall), builtin_psave0) \ + X(fastcall_psave0, CALLCONV(fastcall), builtin_psave0) \ + X(thiscall_psave0, CALLCONV(thiscall), builtin_psave0) \ + X(regparm1_psave0, CALLCONV(regparm(1)), builtin_psave0) \ + X(regparm2_psave0, CALLCONV(regparm(2)), builtin_psave0) \ + X(regparm3_psave0, CALLCONV(regparm(3)), builtin_psave0) # else // MSC # define FOR_EACH_CALLCONV(X) \ - X(default,, builtin) \ - X(cdecl, CALLCONV(cdecl), builtin) \ - X(stdcall, CALLCONV(stdcall), builtin) \ - X(fastcall, CALLCONV(fastcall), builtin) \ - /* X(CALLCONV(thiscall), thiscall) MSVC emits "C3865: '__thiscall': can only be used on native member functions" */ \ + X(default,, builtin) \ + X(cdecl, CALLCONV(cdecl), builtin) \ + X(stdcall, CALLCONV(stdcall), builtin) \ + X(fastcall, CALLCONV(fastcall), builtin) \ + /* X(thiscall, CALLCONV(thiscall), builtin) MSVC emits "C3865: '__thiscall': can only be used on native member functions" */ \ /* in theory we can emulate thiscall via fastcall 
https://tresp4sser.wordpress.com/2012/10/06/how-to-hook-thiscall-functions/ */ \ - X(vectorcall, CALLCONV(vectorcall), builtin) \ - X(fastcall_nostkclean, na, custom ) + X(vectorcall, CALLCONV(vectorcall), builtin) \ + X(fastcall_nostkclean, na, custom ) \ + X(default_psave0,, builtin_psave0) \ + X(cdecl_psave0, CALLCONV(cdecl), builtin_psave0) \ + X(stdcall_psave0, CALLCONV(stdcall), builtin_psave0) \ + X(fastcall_psave0, CALLCONV(fastcall), builtin_psave0) \ + /* X(thiscall_psave0, CALLCONV(thiscall), builtin_psave0) */ \ + X(vectorcall_psave0, CALLCONV(vectorcall), builtin_psave0) # endif #elif defined(LIBGOLANG_ARCH_amd64) # define FOR_EACH_CALLCONV(X) \ - X(default,, builtin) + X(default,, builtin) \ + X(default_psave0,, builtin_psave0) #elif defined(LIBGOLANG_ARCH_arm64) # define FOR_EACH_CALLCONV(X) \ - X(default,, builtin) + X(default,, builtin) \ + X(default_psave0,, builtin_psave0) #else # error "unsupported architecture" #endif @@ -221,6 +249,7 @@ cdef struct _pickle_PatchCtx: SaveFunc Pickler_save_orig # what was there before PicklerTypeInfo iPickler # information detected about PicklerObject type + PyObject* pymod # module of the patched type # patch contexts for _pickle and _zodbpickle modules @@ -234,7 +263,7 @@ cdef _pickle_PatchCtx _zpickle_patchctx # # - *STRING are loaded as bstr # - bstr is saved as *STRING -# - pickletools decodes *STRING as UTF-8 +# - pickletools decodes *STRING and related opcodes as UTF-8b cdef _patch_str_pickle(): try: import zodbpickle @@ -246,6 +275,9 @@ cdef _patch_str_pickle(): if PY_MAJOR_VERSION >= 3: import pickletools, codecs _codecs_escape_decode = codecs.escape_decode + def xread_stringnl_noescape(f): + data = pickletools.read_stringnl(f, decode=False, stripquotes=False) + return pybstr(data) def xread_stringnl(f): data = _codecs_escape_decode(pickletools.read_stringnl(f, decode=False))[0] return pybstr(data) @@ -256,13 +288,15 @@ cdef _patch_str_pickle(): data = pickletools.read_string4(f).encode('latin1') return 
pybstr(data) + pickletools.stringnl_noescape.reader = xread_stringnl_noescape pickletools.stringnl.reader = xread_stringnl pickletools.string1.reader = xread_string1 pickletools.string4.reader = xread_string4 if zodbpickle: from zodbpickle import pickletools_3 as zpickletools - zpickletools.stringnl.reader = xread_stringnl # was same logic as in std pickletools + zpickletools.stringnl_noescape.reader = xread_stringnl_noescape # was same logic + zpickletools.stringnl.reader = xread_stringnl # as in std pickletools zpickletools.string1.reader = xread_string1 zpickletools.string4.reader = xread_string4 @@ -323,7 +357,7 @@ cdef _patch_pickle(pickle, _pickle, _pickle_PatchCtx* _pctx): pickle.loads = _pickle.loads pickle.Unpickler = _pickle.Unpickler pickle.dump = _pickle.dump - pickle.dumps = _pickle.dumps # XXX needed? + pickle.dumps = _pickle.dumps pickle.Pickler = _pickle.Pickler # patch py @@ -376,6 +410,18 @@ cdef _patch_pypickle(pickle, shadowed): self.memoize(obj) Pickler.dispatch[pybstr] = save_bstr + # adjust Pickler to save persistent ID in protocol 0 as UTF-8 + Pickler_save_pers = Pickler.save_pers + def save_pers(self, pid): + if self.proto >= 1: + Pickler_save_pers(self, pid) + else: + pid_str = pybstr(pid) + if b'\n' in pid_str: + raise pickle.PicklingError(r'persistent ID contains \n') + self.write(b'P' + pid_str + b'\n') + Pickler.save_pers = save_pers + # _patch_cpickle serves _patch_pickle for C version. 
cdef _patch_cpickle(_pickle, _pickle_PatchCtx *pctx): # adjust load / loads to use 'bstr' encoding by default @@ -417,6 +463,10 @@ cdef _patch_cpickle(_pickle, _pickle_PatchCtx *pctx): assert xsave.cconv == save.cconv, (callconv_str(xsave.cconv), callconv_str(save.cconv)) cpatch(&pctx.Pickler_save_orig.addr, xsave.addr) + # remember the module of patched type + pctx.pymod = _pickle + Py_INCREF(_pickle) # stays alive forever + # XXX test at runtime that we hooked save correctly @@ -454,8 +504,9 @@ cdef _patch_capi_unicode_decode_as_bstr(): # ---- adjusted C bits for saving ---- -# adjust Pickler save to save bstr via *STRING opcodes. -# This mirrors corresponding py saving adjustments, but is more involved to implement. +# adjust Pickler save to save bstr via *STRING opcodes and handle persistent +# references via our codepath. This mirrors corresponding py saving +# adjustments, but is more involved to implement. cdef int _pickle_Pickler_xsave(PicklerObject* self, PyObject* obj, int pers_save) except -1: return __Pickler_xsave(&_pickle_patchctx, self, obj, pers_save) @@ -463,12 +514,17 @@ cdef int _pickle_Pickler_xsave(PicklerObject* self, PyObject* obj, int pers_save cdef int _zpickle_Pickler_xsave(PicklerObject* self, PyObject* obj, int pers_save) except -1: return __Pickler_xsave(&_zpickle_patchctx, self, obj, pers_save) +cdef int _pickle_Pickler_xsave_psave0(PicklerObject* self, PyObject* obj) except -1: + return __Pickler_xsave_psave0(&_pickle_patchctx, self, obj) + +cdef int _zpickle_Pickler_xsave_psave0(PicklerObject* self, PyObject* obj) except -1: + return __Pickler_xsave_psave0(&_zpickle_patchctx, self, obj) + # callconv wrappers XXX place cdef extern from *: r""" static int __pyx_f_6golang_7_golang__pickle_Pickler_xsave(PicklerObject*, PyObject*, int); static int __pyx_f_6golang_7_golang__zpickle_Pickler_xsave(PicklerObject*, PyObject*, int); - #define DEF_PICKLE_XSAVE_builtin(ccname, callconv) \ static int callconv \ 
_pickle_Pickler_xsave_##ccname(PicklerObject* self, PyObject* obj, int pers_save) { \ @@ -480,6 +536,19 @@ cdef extern from *: return __pyx_f_6golang_7_golang__zpickle_Pickler_xsave(self, obj, pers_save); \ } + static int __pyx_f_6golang_7_golang__pickle_Pickler_xsave_psave0(PicklerObject*, PyObject*); + static int __pyx_f_6golang_7_golang__zpickle_Pickler_xsave_psave0(PicklerObject*, PyObject*); + #define DEF_PICKLE_XSAVE_builtin_psave0(ccname, callconv) \ + static int callconv \ + _pickle_Pickler_xsave_##ccname(PicklerObject* self, PyObject* obj) { \ + return __pyx_f_6golang_7_golang__pickle_Pickler_xsave_psave0(self, obj); \ + } + #define DEF_ZPICKLE_XSAVE_builtin_psave0(ccname, callconv) \ + static int callconv \ + _zpickle_Pickler_xsave_##ccname(PicklerObject* self, PyObject* obj) { \ + return __pyx_f_6golang_7_golang__zpickle_Pickler_xsave_psave0(self, obj); \ + } + #define DEF_PICKLE_XSAVE_custom(ccname, _) \ extern "C" char _pickle_Pickler_xsave_##ccname; #define DEF_ZPICKLE_XSAVE_custom(ccname, _) \ @@ -496,7 +565,6 @@ cdef extern from *: SaveFunc{(void*)&_pickle_Pickler_xsave_##ccname, CALLCONV_##ccname}, FOR_EACH_CALLCONV(PICKLE_CC_XSAVE) }; - static std::vector _zpickle_Pickler_xsave_ccv = { #define ZPICKLE_CC_XSAVE(ccname, _, __) \ SaveFunc{(void*)&_zpickle_Pickler_xsave_##ccname, CALLCONV_##ccname}, @@ -520,12 +588,52 @@ cdef extern from *: cdef int __Pickler_xsave(_pickle_PatchCtx* pctx, PicklerObject* self, PyObject* obj, int pers_save) except -1: - # !bstr -> use builtin pickle code - if obj.ob_type != pybstr: - return save_invoke(pctx.Pickler_save_orig.addr, pctx.Pickler_save_orig.cconv, - self, obj, pers_save) + # do not rely on pers_save value and instead set .pers_func=NULL during the + # call not to let xpers_save to be entered recursively and to deactivate + # original save->pers_save codepath. See note in __detect_save_callconv + # about why pers_save value might be unreliable. 
+ # + # we are ok to do adjust .pers_save because Pickler, from the beginning, is + # not safe to be used form multiple threads simultaneously. + ppers_func = ((self) + pctx.iPickler.off_pers_func) + pers_func = ppers_func[0] + try: + ppers_func[0] = NULL + return ___Pickler_xsave(pctx, self, obj, pers_func) + finally: + ppers_func[0] = pers_func + +# __Pickler_xsave_psave0 is used instead of __Pickler_xsave when we detected +# that original save might be compiled with pers_save const-propagated with 0. +cdef int __Pickler_xsave_psave0(_pickle_PatchCtx* pctx, PicklerObject* self, PyObject* obj) except -1: + # similarly to __Pickler_xsave set .pers_func=NULL during the call not to + # let xpers_save to be entered recursively and to deactivate original + # save->pers_save codepath. + ppers_func = ((self) + pctx.iPickler.off_pers_func) + pers_func = ppers_func[0] + try: + ppers_func[0] = NULL + return ___Pickler_xsave(pctx, self, obj, pers_func) + finally: + ppers_func[0] = pers_func + +cdef int ___Pickler_xsave(_pickle_PatchCtx* pctx, PicklerObject* self, PyObject* obj, PyObject* pers_func) except -1: + # persistent reference + if pers_func != NULL: + st = __Pickler_xsave_pers(pctx, self, obj, pers_func) + if st != 0: + return st + + # bstr + if obj.ob_type == pybstr: + return __Pickler_xsave_bstr(pctx, self, obj) - # bstr -> pickle it as *STRING + # everything else -> use builtin pickle code + return save_invoke(pctx.Pickler_save_orig.addr, pctx.Pickler_save_orig.cconv, self, obj) + + +# __Pickler_xsave_bstr saves bstr as *STRING. +cdef int __Pickler_xsave_bstr(_pickle_PatchCtx* pctx, PicklerObject* self, PyObject* obj) except -1: cdef const char* s cdef Py_ssize_t l cdef byte[5] h @@ -564,6 +672,43 @@ cdef int __Pickler_xsave(_pickle_PatchCtx* pctx, PicklerObject* self, PyObject* return 0 +# __Pickler_xsave_pers detects if obj has persistent ID and, if yes, saves it as persistent references. 
+# XXX explain: proto=0 UTF8-b instead of ascii and \n rejected +# XXX and exists to be able to patch save when CC does constprop +cdef int __Pickler_xsave_pers(_pickle_PatchCtx* pctx, PicklerObject* self, PyObject* obj, PyObject* pers_func) except -1: + cdef PyObject* pers_func_self = NULL + + if pctx.iPickler.off_pers_func_self != -1: + pers_func_self = (((self) + pctx.iPickler.off_pers_func_self))[0] + + pid = _call_meth(pers_func, pers_func_self, obj) + if pid is None: + return 0 + + cdef int bin = (((self) + pctx.iPickler.off_bin))[0] + if bin: + __Pickler_xsave(pctx, self, pid, 1) + __Pickler_xWrite(pctx, self, b'Q', 1) # BINPERSID + + else: + pid_str = pybstr(pid) + if b'\n' in pid_str: + raise (pctx.pymod).PicklingError(r'persistent ID contains \n') + s = PyBytes_AS_STRING(pid_str) + l = PyBytes_GET_SIZE(pid_str) + __Pickler_xWrite(pctx, self, b'P', 1) # PERSID + __Pickler_xWrite(pctx, self, s, l) + __Pickler_xWrite(pctx, self, b'\n', 1) + + return 1 + +# _call_meth invokes func(self, obj) or func(obj) if self is NULL. +cdef object _call_meth(PyObject* func, PyObject* self, PyObject* obj): + if self != NULL: + return PyObject_CallFunctionObjArgs(func, self, obj, NULL) + return PyObject_CallObject(func, (obj,)) # XXX PyObject_CallOneArg on py3 + + # __Pickler_xWrite mimics original _Pickler_Write. # @@ -607,7 +752,7 @@ cdef int __Pickler_xWrite(_pickle_PatchCtx* pctx, PicklerObject* self, const cha # _detect_Pickler_typeinfo detects information about PicklerObject type # through runtime introspection. # -# This information is used mainly by __Pickler_xWrite. +# This information is used mainly by __Pickler_xWrite and __Pickler_xsave_pers. 
cdef PicklerTypeInfo _detect_Pickler_typeinfo(pyPickler) except *: cdef PicklerTypeInfo t @@ -805,6 +950,65 @@ cdef PicklerTypeInfo _detect_Pickler_typeinfo(pyPickler) except *: markbusy(t.off_max_output_len, sizeof(Py_ssize_t)) trace(".max_output_len:\t", t.off_max_output_len) + # .pers_func + # set .persistent_id to known function and find that pointers + obj_copy() + def persid_func(obj): pass + pyobj.persistent_id = persid_func + dpersid_func = obj_diff(sizeof(PyObject*)) + assert len(dpersid_func) == 1, dpersid_func + t.off_pers_func = dpersid_func[0] + assert ((bobj + t.off_pers_func))[0] == persid_func + markbusy(t.off_pers_func, sizeof(PyObject*)) + trace('.pers_func:\t', t.off_pers_func) + + # .pers_func_self + # start with class that defines .persistent_id methond, then set .persistent_id + # to known function and find which pointers change: + # * if it is only 1 pointer - there is no .pers_func_self (e.g. zodbpickle) + # * if it is 2 pointers - .pers_func_self is there and it is reset to NULL + class pyPickler2(pyPickler): + def persistent_id(self, obj): pass + assert isinstance(pyPickler2, type) + cdef PyTypeObject* Pickler2 = pyPickler2 + cdef _XPyTypeObject* xPickler2 = <_XPyTypeObject*> pyPickler2 + + assert Pickler2.tp_basicsize >= t.size + assert Pickler2.tp_itemsize == 0 + + pyobj = pyPickler2(Null()) + obj = pyobj + assert obj.ob_type == Pickler2 + bobj = obj + + obj_copy() + pyPickler.persistent_id.__set__(pyobj, persid_func) + dpersid_meth = obj_diff(sizeof(PyObject*)) + assert len(dpersid_meth) in (1,2), dpersid_meth + cdef Py_ssize_t off1, off2 + if len(dpersid_meth) == 1: + t.off_pers_func_self = -1 + assert dpersid_meth[0] == t.off_pers_func + assert ((bobj + t.off_pers_func))[0] == persid_func + else: + assert len(dpersid_meth) == 2 + off1 = (dpersid_meth[0]) + off2 = (dpersid_meth[1]) + val1 = ((bobj + off1))[0] + val2 = ((bobj + off2))[0] + if val1 == NULL: + assert off2 == t.off_pers_func + assert val2 == persid_func + 
t.off_pers_func_self = off1 + elif val2 == NULL: + assert off1 == t.off_pers_func + assert val1 == persid_func + t.off_pers_func_self = off2 + else: + assert False, "cannot find NULL after resetting .pers_func_self" + markbusy(t.off_pers_func_self, sizeof(PyObject*)) + trace('.pers_func_self:\t', t.off_pers_func_self) + free(bobj2) return t @@ -931,6 +1135,22 @@ cdef extern from * nogil: # see _golang_str_pickle.S for details # convention is usually the same as default, but on e.g. i386 - where the # default cdecl means to put arguments on the stack, the compiler usually # changes calling convention to use registers instead. +# +# It might be also the case that the code is generated with const-propagated +# pers_save=0 so save becomes a function with 2 arguments instead of 3. Such +# variants are also probed, and if we see that 2-args probe worked ok, we do not +# delve into proving whether pers_save was really const-propagated or not: even +# if it is not const-propagated __Pickler_xsave_psave0 deactivates original +# save->pers_save codepath so the worst that can happen is that we ignore +# pers_save argument passed in a register or on the stack. We are ok to do that +# because we let the probe go only if stkclean_by_callee is the same for both +# save and probe, and because original code passes pers_save=0 all around +# except from inside pers_save which we deactivate. +# +# Note that regarding pers_save the detection of calling convention is not +# reliable because save is invoked with pers_save=0 and zeros might be present +# in a register or on the stack for unrelated reason. For this reason +# __Pickler_xsave does not rely on pers_save value at all in its control flow. 
cdef Callconv __detect_save_callconv(pyPickler, void* save) except *: for p in saveprobe_test_ccv: #print("save: probing %s" % callconv_str(p.cconv)) @@ -1001,6 +1221,11 @@ cdef extern from * nogil: saveprobe_##ccname(void* self, PyObject* obj, int pers_save) { \ return saveprobe(self, obj, pers_save); \ } + #define DEF_SAVEPROBE_builtin_psave0(ccname, callconv) \ + static int callconv \ + saveprobe_##ccname(void* self, PyObject* obj) { \ + return saveprobe(self, obj, 0); \ + } #define DEF_SAVEPROBE_custom(ccname, _) \ extern "C" char saveprobe_##ccname; #define DEF_SAVEPROBE(ccname, callconv, cckind) DEF_SAVEPROBE_##cckind(ccname, callconv) @@ -1028,20 +1253,28 @@ cdef extern from * nogil: vector[SaveFunc] saveprobe_test_ccv -# XXX doc save_invoke ... +# XXX doc save_invoke pers_save=1 ... # XXX place cdef extern from *: r""" #define CC_SAVE_DEFCALL1_builtin(ccname, callconv) + #define CC_SAVE_DEFCALL1_builtin_psave0(ccname, callconv) #define CC_SAVE_DEFCALL1_custom(ccname, _) \ extern "C" int CALLCONV(fastcall) \ save_invoke_as_##ccname(void* save, void* self, PyObject* obj, int pers_save); #define CC_SAVE_DEFCALL1(ccname, callconv, cckind) CC_SAVE_DEFCALL1_##cckind(ccname, callconv) FOR_EACH_CALLCONV(CC_SAVE_DEFCALL1) - static int save_invoke(void* save, Callconv cconv, void* self, PyObject* obj, int pers_save) { + static int save_invoke(void* save, Callconv cconv, void* self, PyObject* obj) { using namespace golang; + // passing pers_save is unreliable and we anyway always deactivate + // original save->pers_save codepath and handle persistent references + // ourselves. But try to deactivate it here once more just in case. + // + // See __Pickler_xsave and note in __detect_save_callconv for details. 
+ int pers_save = 1; + switch(cconv) { #define CC_SAVE_CALL1_builtin(ccname, callconv) \ case CALLCONV_ ## ccname: \ @@ -1050,6 +1283,10 @@ cdef extern from *: #define CC_SAVE_CALL1_custom(ccname, _) \ case CALLCONV_ ## ccname: \ return save_invoke_as_##ccname(save, self, obj, pers_save); + #define CC_SAVE_CALL1_builtin_psave0(ccname, callconv) \ + case CALLCONV_ ## ccname: \ + return ((int (callconv *)(void*, PyObject*))save) \ + (self, obj); #define CC_SAVE_CALL1(ccname, callconv, cckind) CC_SAVE_CALL1_##cckind(ccname, callconv) FOR_EACH_CALLCONV(CC_SAVE_CALL1) default: @@ -1057,7 +1294,7 @@ cdef extern from *: } } """ - int save_invoke(void* save, Callconv cconv, void* self, PyObject* obj, int pers_save) except -1 + int save_invoke(void* save, Callconv cconv, void* self, PyObject* obj) except -1 # - cfunc_direct_callees returns addresses of functions that cfunc calls directly. diff --git a/golang/_golang_str_pickle_test.pyx b/golang/_golang_str_pickle_test.pyx index 62c9a2f..b041974 100644 --- a/golang/_golang_str_pickle_test.pyx +++ b/golang/_golang_str_pickle_test.pyx @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright (C) 2023 Nexedi SA and Contributors. -# Kirill Smelkov +# Copyright (C) 2023-2024 Nexedi SA and Contributors. 
+# Kirill Smelkov # # This program is free software: you can Use, Study, Modify and Redistribute # it under the terms of the GNU General Public License version 3, or (at your @@ -102,16 +102,16 @@ cdef extern from * nogil: int CALLCONV(fastcall) tfunc_fastcall3(int x, int y, int z) { return x; } - #ifndef LIBGOLANG_CC_msc // see note about C3865 in FOR_EACH_CALLCONV + # ifndef LIBGOLANG_CC_msc // see note about C3865 in FOR_EACH_CALLCONV int CALLCONV(thiscall) tfunc_thiscall1(int x) { return x; } int CALLCONV(thiscall) tfunc_thiscall2(int x, int y) { return x; } int CALLCONV(thiscall) tfunc_thiscall3(int x, int y, int z) { return x; } - #endif + # endif - #ifndef LIBGOLANG_CC_msc // no regparm on MSCV + # ifndef LIBGOLANG_CC_msc // no regparm on MSVC int CALLCONV(regparm(1)) tfunc_regparm1_1(int x) { return x; } int CALLCONV(regparm(1)) @@ -132,7 +132,7 @@ cdef extern from * nogil: tfunc_regparm3_2(int x, int y) { return x; } int CALLCONV(regparm(3)) tfunc_regparm3_3(int x, int y, int z) { return x; } - #endif + # endif static std::vector<_Test_cfunc_is_callee_clenup> _cfunc_is_callee_cleanup_testv = { CASE(tfunc_cdecl1 , 0 * 4), @@ -144,12 +144,12 @@ cdef extern from * nogil: CASE(tfunc_fastcall1 , 0 * 4), CASE(tfunc_fastcall2 , 0 * 4), CASE(tfunc_fastcall3 , 1 * 4), - #ifndef LIBGOLANG_CC_msc + # ifndef LIBGOLANG_CC_msc CASE(tfunc_thiscall1 , 0 * 4), CASE(tfunc_thiscall2 , 1 * 4), CASE(tfunc_thiscall3 , 2 * 4), - #endif - #ifndef LIBGOLANG_CC_msc + # endif + # ifndef LIBGOLANG_CC_msc CASE(tfunc_regparm1_1 , 0 * 4), CASE(tfunc_regparm1_2 , 0 * 4), CASE(tfunc_regparm1_3 , 0 * 4), @@ -159,7 +159,7 @@ cdef extern from * nogil: CASE(tfunc_regparm3_1 , 0 * 4), CASE(tfunc_regparm3_2 , 0 * 4), CASE(tfunc_regparm3_3 , 0 * 4), - #endif + # endif }; #else diff --git a/golang/_gopath.py b/golang/_gopath.py index 8f34b33..d5e1f2a 100644 --- a/golang/_gopath.py +++ b/golang/_gopath.py @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2019 Nexedi SA and Contributors. 
+# Copyright (C) 2018-2024 Nexedi SA and Contributors. # Kirill Smelkov # # This program is free software: you can Use, Study, Modify and Redistribute @@ -34,11 +34,7 @@ import os, os.path import sys - -import warnings -with warnings.catch_warnings(): - warnings.simplefilter('ignore', DeprecationWarning) - import imp +import six # _gopathv returns $GOPATH vector. def _gopathv(): @@ -51,11 +47,25 @@ def _gopathv(): # gimport imports python module or package from fully-qualified module name under $GOPATH. def gimport(name): - imp.acquire_lock() + _gimport_lock() try: return _gimport(name) finally: - imp.release_lock() + _gimport_unlock() + +# on py2 there is global import lock +# on py3 we need to organize our own gimport synchronization +if six.PY2: + import imp + _gimport_lock = imp.acquire_lock + _gimport_unlock = imp.release_lock +else: + from importlib import machinery as imp_machinery + from importlib import util as imp_util + from golang import sync + _gimport_mu = sync.Mutex() + _gimport_lock = _gimport_mu.lock + _gimport_unlock = _gimport_mu.unlock def _gimport(name): # we will register imported module into sys.modules with adjusted path. @@ -93,4 +103,16 @@ def _gimport(name): # https://stackoverflow.com/a/67692 - return imp.load_source(modname, modpath) + return _imp_load_source(modname, modpath) + +def _imp_load_source(modname, modpath): + if six.PY2: + return imp.load_source(modname, modpath) + + # https://docs.python.org/3/whatsnew/3.12.html#imp + loader = imp_machinery.SourceFileLoader(modname, modpath) + spec = imp_util.spec_from_file_location(modname, modpath, loader=loader) + mod = imp_util.module_from_spec(spec) + sys.modules[modname] = mod + loader.exec_module(mod) + return mod diff --git a/golang/_strconv.pyx b/golang/_strconv.pyx index 3b1db0c..03c7528 100644 --- a/golang/_strconv.pyx +++ b/golang/_strconv.pyx @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # cython: language_level=2 -# Copyright (C) 2018-2023 Nexedi SA and Contributors. 
+# Copyright (C) 2018-2024 Nexedi SA and Contributors. # Kirill Smelkov # # This program is free software: you can Use, Study, Modify and Redistribute diff --git a/golang/fmt.h b/golang/fmt.h index 7c33802..a039529 100644 --- a/golang/fmt.h +++ b/golang/fmt.h @@ -1,7 +1,7 @@ #ifndef _NXD_LIBGOLANG_FMT_H #define _NXD_LIBGOLANG_FMT_H -// Copyright (C) 2019-2023 Nexedi SA and Contributors. +// Copyright (C) 2019-2024 Nexedi SA and Contributors. // Kirill Smelkov // // This program is free software: you can Use, Study, Modify and Redistribute diff --git a/golang/golang_str_pickle_test.py b/golang/golang_str_pickle_test.py index 1bf1a7b..c782fd7 100644 --- a/golang/golang_str_pickle_test.py +++ b/golang/golang_str_pickle_test.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright (C) 2022-2023 Nexedi SA and Contributors. +# Copyright (C) 2022-2024 Nexedi SA and Contributors. # Kirill Smelkov # # This program is free software: you can Use, Study, Modify and Redistribute @@ -70,30 +70,30 @@ def pickle2tools(pickle): # ---- pickling/unpickling under gpystr ---- -# verify that loading *STRING opcodes loads them as bstr on gpython by default. -# TODO or with encoding='bstr' under plain py -@gpystr_only -def test_string_pickle_load_STRING(pickle): - p_str = b"S'\\xd0\\xbc\\xd0\\xb8\\xd1\\x80\\xff'\n." # STRING 'мир\xff' - p_utf8 = b"S'"+xbytes('мир')+b"\\xff'\n." # STRING 'мир\xff' - p_sbins = b'U\x07\xd0\xbc\xd0\xb8\xd1\x80\xff.' # SHORT_BINSTRING 'мир\xff' - p_bins = b'T\x07\x00\x00\x00\xd0\xbc\xd0\xb8\xd1\x80\xff.' # BINSTRING 'мир\xff' +# test pickles with *STRING +STRING_bytes = xbytes('мир')+b'\xff' # binary data in all test *STRING pickles +p_str = b"S'\\xd0\\xbc\\xd0\\xb8\\xd1\\x80\\xff'\n." # STRING 'мир\xff' +p_utf8 = b"S'"+xbytes('мир')+b"\\xff'\n." # STRING 'мир\xff' +p_sbins = b'U\x07\xd0\xbc\xd0\xb8\xd1\x80\xff.' # SHORT_BINSTRING 'мир\xff' +p_bins = b'T\x07\x00\x00\x00\xd0\xbc\xd0\xb8\xd1\x80\xff.' 
# BINSTRING 'мир\xff' - p_bytes = xbytes('мир')+b'\xff' +# checkSTRING invokes f on all test *STRING pickles. +def checkSTRING(f): + f(p_str) + f(p_utf8) + f(p_sbins) + f(p_bins) - # check invokes f on all test pickles - def check(f): - f(p_str) - f(p_utf8) - f(p_sbins) - f(p_bins) +# verify that loading *STRING opcodes loads them as bstr on gpython by default. +@gpystr_only +def test_strings_pickle_load_STRING(pickle): + check = checkSTRING # default -> bstr on both py2 and py3 - # TODO only this check is gpystr_only -> remove whole-func @gpystr_only def _(p): obj = xloads(pickle, p) assert type(obj) is bstr - assert obj == p_bytes + assert obj == STRING_bytes check(_) # also test bstr inside tuple (for symmetry with save) @@ -104,49 +104,34 @@ def _(p): assert len(tobj) == 1 obj = tobj[0] assert type(obj) is bstr - assert obj == p_bytes + assert obj == STRING_bytes check(_) - # pickle supports encoding=... only on py3 - if six.PY3: - # encoding='bstr' -> bstr - def _(p): - obj = xloads(pickle, p, encoding='bstr') - assert type(obj) is bstr - assert obj == p_bytes - check(_) - - # encoding='bytes' -> bytes - def _(p): - obj = xloads(pickle, p, encoding='bytes') - assert type(obj) is bytes - assert obj == p_bytes - check(_) - - # encoding='utf-8' -> UnicodeDecodeError - def _(p): - with raises(UnicodeDecodeError): - xloads(pickle, p, encoding='utf-8') - check(_) - - # encoding='utf-8', errors=... -> unicode - def _(p): - obj = xloads(pickle, p, encoding='utf-8', errors='backslashreplace') - assert type(obj) is unicode - assert obj == u'мир\\xff' - check(_) - + # also test bstr used as persistent reference directly and as part of tuple (symmetry with save) + def _(p): + p_ = p[:-1] + b'Q.' + pobj = ploads(pickle, p_) + assert type(pobj) is tPersistent + assert type(pobj._p_oid) is bstr + assert pobj._p_oid == STRING_bytes + check(_) + def _(p): + p_ = b'(' + p[:-1] + b'tQ.' 
+ pobj = ploads(pickle, p_) + assert type(pobj) is tPersistent + assert type(pobj._p_oid) is tuple + assert len(pobj._p_oid) == 1 + obj = pobj._p_oid[0] + assert type(obj) is bstr + assert obj == STRING_bytes + check(_) # verify that saving bstr results in *STRING opcodes on gpython. @gpystr_only def test_strings_pickle_save_STRING(pickle): - s = s0 = b(xbytes('мир')+b'\xff') + s = s0 = b(STRING_bytes) assert type(s) is bstr - p_utf8 = b"S'"+xbytes('мир')+b"\\xff'\n." # STRING 'мир\xff' - p_sbins = b'U\x07\xd0\xbc\xd0\xb8\xd1\x80\xff.' # SHORT_BINSTRING 'мир\xff' - p_bins = b'T\x07\x00\x00\x00\xd0\xbc\xd0\xb8\xd1\x80\xff.' # BINSTRING 'мир\xff' - def dumps(proto): return xdumps(pickle, s, proto) @@ -163,18 +148,84 @@ def dumps(proto): # also test bstr inside tuple to verify that what we patched is actually # _pickle.save that is invoked from inside other save_X functions. s = (s0,) - p_tutf8 = b'(' + p_utf8[:-1] + b't.' - p_tsbins = b'(' + p_sbins[:-1] + b't.' - assert dumps(0) == p_tutf8 - assert dumps(1) == p_tsbins + p_tuple_utf8 = b'(' + p_utf8[:-1] + b't.' + p_tuple_sbins = b'(' + p_sbins[:-1] + b't.' + assert dumps(0) == p_tuple_utf8 + assert dumps(1) == p_tuple_sbins # don't test proto ≥ 2 because they start to use TUPLE1 instead of TUPLE + # also test bstr used as persistent reference to verify pers_save codepath + obj = tPersistent(s0) + def dumps(proto): + return pdumps(pickle, obj, proto) + assert dumps(0) == b'P' + STRING_bytes + '\n.' + for proto in range(1, HIGHEST_PROTOCOL(pickle)+1): + assert dumps(proto) == p_sbins[:-1] + b'Q.' + + # ... and peristent reference being tuple to verifiy pers_save + # stringification in proto=0 and recursion to save in proto≥1. + obj = tPersistent((s0,)) + try: + assert dumps(0) == b'P(' + p_utf8[1:-2] + ',)\n.' 
+ except pickle.PicklingError as e: + # on py2 cpickle insists that with proto=0 pid must be string + if six.PY2: + assert e.args == ('persistent id must be string',) + else: + raise + assert dumps(1) == p_tuple_sbins[:-1] + b'Q.' + # no proto ≥ 2 because they start to use TUPLE1 instead of TUPLE + + # proto 0 with \n in persid -> rejected + obj = tPersistent(b('a\nb')) + if six.PY3: # TODO also consider patching save_pers codepath on py2 + with raises(pickle.PicklingError, match=r'persistent ID contains \\n') as e: + dumps(0) + for proto in range(1, HIGHEST_PROTOCOL(pickle)+1): + assert dumps(proto) == b'U\x03a\nbQ.' + + +# verify that unpickling handles encoding=bstr|* . +# TODO also handle encoding='bstr' under plain py +@mark.skipif(not six.PY3, reason="pickle supports encoding=... only on py3") +@gpystr_only +def test_strings_pickle_load_encoding(pickle): + check = checkSTRING + + # encoding='bstr' -> bstr + def _(p): + obj = xloads(pickle, p, encoding='bstr') + assert type(obj) is bstr + assert obj == STRING_bytes + check(_) + + # encoding='bytes' -> bytes + def _(p): + obj = xloads(pickle, p, encoding='bytes') + assert type(obj) is bytes + assert obj == STRING_bytes + check(_) + + # encoding='utf-8' -> UnicodeDecodeError + def _(p): + with raises(UnicodeDecodeError): + xloads(pickle, p, encoding='utf-8') + check(_) + + # encoding='utf-8', errors=... -> unicode + def _(p): + obj = xloads(pickle, p, encoding='utf-8', errors='backslashreplace') + assert type(obj) is unicode + assert obj == u'мир\\xff' + check(_) + + # verify that loading *UNICODE opcodes loads them as unicode/ustr. # this is standard behaviour but we verify it since we patch pickle's strings processing. # also verify save lightly for symmetry. 
# NOTE not @gpystr_only -def test_string_pickle_loadsave_UNICODE(pickle): +def test_strings_pickle_loadsave_UNICODE(pickle): # NOTE builtin pickle behaviour is to save unicode via 'surrogatepass' error handler # this means that b'мир\xff' -> ustr/unicode -> save will emit *UNICODE with # b'мир\xed\xb3\xbf' instead of b'мир\xff' as data. @@ -263,7 +314,7 @@ def assert_pickle(obj, proto, dumps_ok_gpystr, dumps_ok_stdstr): b'cgolang\nbstr\n(X\x09\x00\x00\x00' # bstr(BINUNICODE) b'\xd0\xbc\xd0\xb8\xd1\x80\xed\xb3\xbftR.') - # NOTE BINUNICODE ...edb3bf not ...ff (see test_string_pickle_loadsave_UNICODE for details) + # NOTE BINUNICODE ...edb3bf not ...ff (see test_strings_pickle_loadsave_UNICODE for details) _(us, 1, b'X\x09\x00\x00\x00\xd0\xbc\xd0\xb0\xd0\xb9\xed\xb3\xbf.', # BINUNICODE b'cgolang\nustr\n(X\x09\x00\x00\x00' # bstr(BINUNICODE) b'\xd0\xbc\xd0\xb0\xd0\xb9\xed\xb3\xbftR.') @@ -302,38 +353,48 @@ def xdiss(pickletools, p): # -> str pickletools.dis(p, out) return out.getvalue() -# verify that disassembling *STRING opcodes works with treating strings as UTF8b. +# verify that disassembling *STRING and related opcodes works with treating strings as UTF8b. @gpystr_only -def test_string_pickle_dis_STRING(pickletools): - p_str = b"S'\\xd0\\xbc\\xd0\\xb8\\xd1\\x80'\n." # STRING 'мир' - p_sbins = b'U\x06\xd0\xbc\xd0\xb8\xd1\x80.' # SHORT_BINSTRING 'мир' - p_bins = b'T\x06\x00\x00\x00\xd0\xbc\xd0\xb8\xd1\x80.' # BINSTRING 'мир' - - bmir = x32("b('мир')", "'мир'") +def test_strings_pickle_dis_STRING(pickletools): + brepr = repr(b(STRING_bytes)) assert xdiss(pickletools, p_str) == """\ 0: S STRING %s - 28: . STOP + 32: . STOP highest protocol among opcodes = 0 -""" % bmir +""" % brepr + + assert xdiss(pickletools, p_utf8) == """\ + 0: S STRING %s + 14: . STOP +highest protocol among opcodes = 0 +""" % brepr assert xdiss(pickletools, p_sbins) == """\ 0: U SHORT_BINSTRING %s - 8: . STOP + 9: . 
STOP highest protocol among opcodes = 1 -""" % bmir +""" % brepr assert xdiss(pickletools, p_bins) == """\ 0: T BINSTRING %s - 11: . STOP + 12: . STOP highest protocol among opcodes = 1 -""" % bmir +""" % brepr + + assert xdiss(pickletools, b'P' + STRING_bytes + b'\n.') == """\ + 0: P PERSID %s + 9: . STOP +highest protocol among opcodes = 0 +""" % brepr # ---- loads and normalized dumps ---- # xloads loads pickle p via pickle.loads # it also verifies that .load and Unpickler.load give the same result. +# +# see also: ploads. def xloads(pickle, p, **kw): obj1 = _xpickle_attr(pickle, 'loads')(p, **kw) obj2 = _xpickle_attr(pickle, 'load') (io.BytesIO(p), **kw) @@ -346,6 +407,8 @@ def xloads(pickle, p, **kw): # xdumps dumps obj via pickle.dumps # it also verifies that .dump and Pickler.dump give the same. # the pickle is returned in normalized form - see pickle_normalize for details. +# +# see also: pdumps. def xdumps(pickle, obj, proto, **kw): p1 = _xpickle_attr(pickle, 'dumps')(obj, proto, **kw) f2 = io.BytesIO(); _xpickle_attr(pickle, 'dump')(obj, f2, proto, **kw) @@ -359,10 +422,85 @@ def xdumps(pickle, obj, proto, **kw): # remove not interesting parts: PROTO / FRAME header and unused PUTs if proto >= 2: - protover = PROTO(proto) - assert p1.startswith(protover) + assert p1.startswith(PROTO(proto)) return pickle_normalize(pickle2tools(pickle), p1) +# ploads loads pickle p via pickle.Unpickler with handling persistent references. +# +# see also: xloads. 
+def ploads(pickle, p, **kw): + Unpickler = _xpickle_attr(pickle, 'Unpickler') + + u1 = Unpickler(io.BytesIO(p), **kw) + u1.persistent_load = lambda pid: tPersistent(pid) + obj1 = u1.load() + + # same with .persistent_load defined as class method + try: + class Unpickler2(Unpickler): + def persistent_load(self, pid): return tPersistent(pid) + except TypeError: + if six.PY2: + # on py2 cPickle.Unpickler is not subclassable at all + obj2 = obj1 + else: + raise + else: + u2 = Unpickler2(io.BytesIO(p), **kw) + obj2 = u2.load() + + assert obj1 == obj2 + return obj1 + +# pdumps dumps obj via pickle.Pickler with handling persistent references. +# the pickle is returned in normalized form - see pickle_normalize for details. +# +# see also: xdumps. +def pdumps(pickle, obj, proto, **kw): + Pickler = _xpickle_attr(pickle, 'Pickler') + + f1 = io.BytesIO() + p1 = Pickler(f1, proto, **kw) + def _(obj): + if isinstance(obj, tPersistent): + return obj._p_oid + return None + p1.persistent_id = _ + p1.dump(obj) + pobj1 = f1.getvalue() + + # same with .persistent_id defined as class method + try: + class Pickler2(Pickler): + def persistent_id(self, obj): + if isinstance(obj, tPersistent): + return obj._p_oid + return None + except TypeError: + if six.PY2: + # on py2 cPickle.Pickler is not subclassable at all + pobj2 = pobj1 + else: + raise + else: + f2 = io.BytesIO() + p2 = Pickler2(f2, proto, **kw) + p2.dump(obj) + pobj2 = f2.getvalue() + + assert pobj1 == pobj2 + + if proto >= 2: + assert pobj1.startswith(PROTO(proto)) + return pickle_normalize(pickle2tools(pickle), pobj1) + +# tPersistent is test class to verify handling of persistent references. 
+class tPersistent(object): + def __init__(t, pid): + t._p_oid = pid + def __eq__(t, rhs): return (type(rhs) is type(t)) and (rhs._p_oid == t._p_oid) + def __ne__(t, rhs): return not (t.__eq__(rhs)) + def _xpickle_attr(pickle, name): # on py3 pickle.py tries to import from C _pickle to optimize by default # -> verify py version if we are asked to test pickle.py diff --git a/golang/golang_str_test.py b/golang/golang_str_test.py index 0692de7..975584a 100644 --- a/golang/golang_str_test.py +++ b/golang/golang_str_test.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright (C) 2018-2023 Nexedi SA and Contributors. +# Copyright (C) 2018-2024 Nexedi SA and Contributors. # Kirill Smelkov # # This program is free software: you can Use, Study, Modify and Redistribute @@ -26,6 +26,7 @@ from golang.gcompat import qq from golang.strconv_test import byterange from golang.golang_test import readfile, assertDoc, _pyrun, dir_testprog, PIPE +from gpython import _tEarlyStrSubclass from pytest import raises, mark, skip import sys import six @@ -2558,6 +2559,50 @@ def _(delta): assert _(b'cde') == b'abcde' +# verify that str subclasses, created before str/unicode are replaced with +# bstr/ustr, continue to work ok. +# +# Even though we try to patch string types early, there are always some str +# subclasses created by builtin modules before golang is loaded. For example +# enum.StrEnum is created early during python startup process via +# pathlib -> fnmatch -> re -> enum import. So if we don't preserve those +# classes to continue to work correctly things are breaking badly. +# +# XXX note !gpystr_only ... +# XXX also test bytes? 
+def tests_strings_early_str_subclass(): + xstr = _tEarlyStrSubclass + + # .tp_new should be adjusted to point to current str + # (else str.__new__ breaks with "str.__new__(xstr) is not safe ...") + obj = str.__new__(xstr, 'abc') + assert type(obj) is xstr + assert obj == 'abc' + assert xstr.__new__ is str.__new__ + + # follow-up .__init__ should be noop (enum uses str.__init__ for real) + obj.__init__('xyz') + assert obj == 'abc' + assert str.__init__ is object.__init__ + assert xstr.__init__ is str.__init__ + + + # XXX place + assert xstr.__base__ is str + assert xstr.__bases__ == (str,) + + # XXX __bases__ + __mro__ for MI + + + """ + assert str.__base__ is object + assert str.__bases__ == (object,) + """ + + + # XXX more... + + # ---- benchmarks ---- # utf-8 decoding diff --git a/golang/golang_test.py b/golang/golang_test.py index 0b6b9cb..46d6e95 100644 --- a/golang/golang_test.py +++ b/golang/golang_test.py @@ -1682,6 +1682,12 @@ def test_defer_excchain_dump_ipython(): # ----//---- (pytest) def test_defer_excchain_dump_pytest(): + # pytest 7.4 also changed traceback output format + # similarly to ipython we do not need to test it becase we activate + # pytest-related patch only on py2 for which latest pytest version is 4.6.11 . + import pytest + if six.PY3 and pytest.version_tuple >= (7,4): + skip("pytest is patched only on py2; pytest7.4 changed traceback format") tbok = readfile(dir_testprog + "/golang_test_defer_excchain.txt-pytest") retcode, stdout, stderr = _pyrun([ # don't let pytest emit internal deprecation warnings to stderr diff --git a/golang/libgolang.h b/golang/libgolang.h index 53a8aec..4131a84 100644 --- a/golang/libgolang.h +++ b/golang/libgolang.h @@ -1,7 +1,7 @@ #ifndef _NXD_LIBGOLANG_H #define _NXD_LIBGOLANG_H -// Copyright (C) 2018-2023 Nexedi SA and Contributors. +// Copyright (C) 2018-2024 Nexedi SA and Contributors. 
// Kirill Smelkov // // This program is free software: you can Use, Study, Modify and Redistribute diff --git a/golang/os.cpp b/golang/os.cpp index a7c7f2a..6c08fdf 100644 --- a/golang/os.cpp +++ b/golang/os.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2019-2023 Nexedi SA and Contributors. +// Copyright (C) 2019-2024 Nexedi SA and Contributors. // Kirill Smelkov // // This program is free software: you can Use, Study, Modify and Redistribute diff --git a/golang/os.h b/golang/os.h index 9ad0c99..1d79d05 100644 --- a/golang/os.h +++ b/golang/os.h @@ -1,7 +1,7 @@ #ifndef _NXD_LIBGOLANG_OS_H #define _NXD_LIBGOLANG_OS_H // -// Copyright (C) 2019-2023 Nexedi SA and Contributors. +// Copyright (C) 2019-2024 Nexedi SA and Contributors. // Kirill Smelkov // // This program is free software: you can Use, Study, Modify and Redistribute diff --git a/golang/os/signal.cpp b/golang/os/signal.cpp index 793e7a4..0677721 100644 --- a/golang/os/signal.cpp +++ b/golang/os/signal.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2021-2023 Nexedi SA and Contributors. +// Copyright (C) 2021-2024 Nexedi SA and Contributors. // Kirill Smelkov // // This program is free software: you can Use, Study, Modify and Redistribute diff --git a/golang/pyx/build.py b/golang/pyx/build.py index 3c15f22..0079dd0 100644 --- a/golang/pyx/build.py +++ b/golang/pyx/build.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2023 Nexedi SA and Contributors. +# Copyright (C) 2019-2024 Nexedi SA and Contributors. # Kirill Smelkov # # This program is free software: you can Use, Study, Modify and Redistribute diff --git a/golang/runtime.cpp b/golang/runtime.cpp index 0fc63e6..dd398d4 100644 --- a/golang/runtime.cpp +++ b/golang/runtime.cpp @@ -1,5 +1,5 @@ -// Copyright (C) 2023 Nexedi SA and Contributors. -// Kirill Smelkov +// Copyright (C) 2023-2024 Nexedi SA and Contributors. 
+// Kirill Smelkov // // This program is free software: you can Use, Study, Modify and Redistribute // it under the terms of the GNU General Public License version 3, or (at your diff --git a/golang/runtime.h b/golang/runtime.h index 60b5765..4eecfc2 100644 --- a/golang/runtime.h +++ b/golang/runtime.h @@ -1,8 +1,8 @@ #ifndef _NXD_LIBGOLANG_RUNTIME_H #define _NXD_LIBGOLANG_RUNTIME_H -// Copyright (C) 2023 Nexedi SA and Contributors. -// Kirill Smelkov +// Copyright (C) 2023-2024 Nexedi SA and Contributors. +// Kirill Smelkov // // This program is free software: you can Use, Study, Modify and Redistribute // it under the terms of the GNU General Public License version 3, or (at your diff --git a/golang/runtime/internal/atomic.cpp b/golang/runtime/internal/atomic.cpp index 2669714..a8b57da 100644 --- a/golang/runtime/internal/atomic.cpp +++ b/golang/runtime/internal/atomic.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2022-2023 Nexedi SA and Contributors. +// Copyright (C) 2022-2024 Nexedi SA and Contributors. // Kirill Smelkov // // This program is free software: you can Use, Study, Modify and Redistribute diff --git a/golang/runtime/internal/syscall.cpp b/golang/runtime/internal/syscall.cpp index 4602c0a..429545a 100644 --- a/golang/runtime/internal/syscall.cpp +++ b/golang/runtime/internal/syscall.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2021-2023 Nexedi SA and Contributors. +// Copyright (C) 2021-2024 Nexedi SA and Contributors. // Kirill Smelkov // // This program is free software: you can Use, Study, Modify and Redistribute diff --git a/golang/runtime/internal/syscall.h b/golang/runtime/internal/syscall.h index 4771a19..204c5b8 100644 --- a/golang/runtime/internal/syscall.h +++ b/golang/runtime/internal/syscall.h @@ -1,7 +1,7 @@ #ifndef _NXD_LIBGOLANG_RUNTIME_INTERNAL_SYSCALL_H #define _NXD_LIBGOLANG_RUNTIME_INTERNAL_SYSCALL_H -// Copyright (C) 2021-2023 Nexedi SA and Contributors. +// Copyright (C) 2021-2024 Nexedi SA and Contributors. 
// Kirill Smelkov // // This program is free software: you can Use, Study, Modify and Redistribute diff --git a/golang/runtime/libgolang.cpp b/golang/runtime/libgolang.cpp index 3714cc7..f91772a 100644 --- a/golang/runtime/libgolang.cpp +++ b/golang/runtime/libgolang.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2023 Nexedi SA and Contributors. +// Copyright (C) 2018-2024 Nexedi SA and Contributors. // Kirill Smelkov // // This program is free software: you can Use, Study, Modify and Redistribute diff --git a/golang/runtime/platform.h b/golang/runtime/platform.h index 8def7e7..f8fa3ea 100644 --- a/golang/runtime/platform.h +++ b/golang/runtime/platform.h @@ -1,8 +1,8 @@ #ifndef _NXD_LIBGOLANG_RUNTIME_PLATFORM_H #define _NXD_LIBGOLANG_RUNTIME_PLATFORM_H -// Copyright (C) 2023 Nexedi SA and Contributors. -// Kirill Smelkov +// Copyright (C) 2023-2024 Nexedi SA and Contributors. +// Kirill Smelkov // // This program is free software: you can Use, Study, Modify and Redistribute // it under the terms of the GNU General Public License version 3, or (at your diff --git a/gpython/__init__.py b/gpython/__init__.py index f2225f8..980f964 100755 --- a/gpython/__init__.py +++ b/gpython/__init__.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Copyright (C) 2018-2023 Nexedi SA and Contributors. +# Copyright (C) 2018-2024 Nexedi SA and Contributors. # Kirill Smelkov # # This program is free software: you can Use, Study, Modify and Redistribute @@ -247,11 +247,12 @@ def run(mmain): pyimpl = platform.python_implementation() v = _version_info_str + pyver = platform.python_version() # ~ v(sys.version_info) but might also have e.g. 
'+' at tail if pyimpl == 'CPython': - ver.append('CPython %s' % v(sys.version_info)) + ver.append('CPython %s' % pyver) elif pyimpl == 'PyPy': ver.append('PyPy %s' % v(sys.pypy_version_info)) - ver.append('Python %s' % v(sys.version_info)) + ver.append('Python %s' % pyver) else: ver = [] # unknown @@ -474,6 +475,7 @@ def init(): from six.moves import builtins for k in golang.__all__: setattr(builtins, k, getattr(golang, k)) +# setattr(builtins, 'CCC', CCC) # XXX kill # sys.version sys.version += (' [GPython %s] [runtime %s] [strings %s]' % (golang.__version__, gpy_runtime_ver, gpy_strings)) @@ -594,8 +596,8 @@ def __next__(self): next = __next__ # for py2 -# for tests XXX continue by first writing test XXX -#1/0 +# for tests: subclass of str that is created before everything else is imported +# and before golang patches builtin str/unicode types. class _tEarlyStrSubclass(str): pass diff --git a/gpython/_gpython.pyx b/gpython/_gpython.pyx index ada1df8..df49691 100644 --- a/gpython/_gpython.pyx +++ b/gpython/_gpython.pyx @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # cython: language_level=2 -# Copyright (C) 2023 Nexedi SA and Contributors. -# Kirill Smelkov +# Copyright (C) 2023-2024 Nexedi SA and Contributors. +# Kirill Smelkov # # This program is free software: you can Use, Study, Modify and Redistribute # it under the terms of the GNU General Public License version 3, or (at your diff --git a/gpython/_gpython_c.cpp b/gpython/_gpython_c.cpp index 05ba977..59fbb93 100644 --- a/gpython/_gpython_c.cpp +++ b/gpython/_gpython_c.cpp @@ -1,5 +1,5 @@ -// Copyright (C) 2023 Nexedi SA and Contributors. -// Kirill Smelkov +// Copyright (C) 2023-2024 Nexedi SA and Contributors. 
+// Kirill Smelkov // // This program is free software: you can Use, Study, Modify and Redistribute // it under the terms of the GNU General Public License version 3, or (at your diff --git a/gpython/gpython_test.py b/gpython/gpython_test.py index 355c2e7..85b97fb 100644 --- a/gpython/gpython_test.py +++ b/gpython/gpython_test.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright (C) 2019-2023 Nexedi SA and Contributors. +# Copyright (C) 2019-2024 Nexedi SA and Contributors. # Kirill Smelkov # # This program is free software: you can Use, Study, Modify and Redistribute diff --git a/pyproject.toml b/pyproject.toml index 07ecad3..d28a182 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,2 +1,2 @@ [build-system] -requires = ["setuptools", "wheel", "setuptools_dso >= 2.7", "cython < 3", "gevent"] +requires = ["setuptools", "wheel", "setuptools_dso >= 2.8", "cython < 3", "gevent"] diff --git a/setup.py b/setup.py index f0539ee..9ef79dc 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,5 @@ # pygolang | pythonic package setup -# Copyright (C) 2018-2023 Nexedi SA and Contributors. +# Copyright (C) 2018-2024 Nexedi SA and Contributors. # Kirill Smelkov # # This program is free software: you can Use, Study, Modify and Redistribute @@ -189,7 +189,7 @@ def install_egg_scripts(self, dist): # requirements of packages under "golang." namespace R = { 'cmd.pybench': {'pytest', 'py'}, - 'pyx.build': {'setuptools', 'wheel', 'cython < 3', 'setuptools_dso >= 2.7'}, + 'pyx.build': {'setuptools', 'wheel', 'cython < 3', 'setuptools_dso >= 2.8'}, 'x.perf.benchlib': {'numpy'}, } # TODO generate `a.b -> a`, e.g. 
x.perf = join(x.perf.*); x = join(x.*) @@ -575,7 +575,7 @@ def defif(name, ok): install_requires = ['gevent', 'six', 'decorator', 'Importing;python_version<="2.7"', # only runtime part: for dylink_prepare_dso - 'setuptools_dso >= 2.7', + 'setuptools_dso >= 2.8', # pyx.build -> setuptools_dso uses multiprocessing # setuptools_dso uses multiprocessing only on Python3, and only on systems where # mp.get_start_method()!='fork', while geventmp does not work on windows. @@ -611,6 +611,7 @@ def defif(name, ok): Programming Language :: Python :: 3.9 Programming Language :: Python :: 3.10 Programming Language :: Python :: 3.11 + Programming Language :: Python :: 3.12 Programming Language :: Python :: Implementation :: CPython Programming Language :: Python :: Implementation :: PyPy Operating System :: POSIX diff --git a/tox.ini b/tox.ini index e99c48f..6833dce 100644 --- a/tox.ini +++ b/tox.ini @@ -1,6 +1,6 @@ [tox] envlist = - {py27d,py27,py37,py38,py39d,py39,py310d,py310,py311d,py311,pypy,pypy3}-{thread,gevent} + {py27d,py27,py37,py38,py39d,py39,py310d,py310,py311d,py311,py312,pypy,pypy3}-{thread,gevent} # ThreadSanitizer @@ -10,18 +10,18 @@ envlist = # (*) PyPy locks its GIL (see RPyGilAcquire) by manually doing atomic cmpxchg # and other games, which TSAN cannot see if PyPy itself was not compiled with # -fsanitize=thread. - {py27d,py27,py37,py38,py39d,py39,py310d,py310,py311d,py311 }-{thread }-tsan + {py27d,py27,py37,py38,py39d,py39,py310d,py310,py311d,py311,py312 }-{thread }-tsan # XXX py*-gevent-tsan would be nice to have, but at present TSAN is not # effective with gevent, because it does not understand greenlet "thread" # switching and so perceives the program as having only one thread where races # are impossible. Disabled to save time. 
-# {py27d,py27,py37,py38,py39d,py39,py310d,py310,py311d,py311 }-{ gevent}-tsan +# {py27d,py27,py37,py38,py39d,py39,py310d,py310,py311d,py311,py312 }-{ gevent}-tsan # AddressSanitizer # XXX asan does not work with gevent: https://github.com/python-greenlet/greenlet/issues/113 - {py27d,py27,py37,py38,py39d,py39,py310d,py310,py311d,py311,pypy,pypy3}-{thread }-asan + {py27d,py27,py37,py38,py39d,py39,py310d,py310,py311d,py311,py312,pypy,pypy3}-{thread }-asan [testenv] basepython = @@ -35,6 +35,8 @@ basepython = py310: python3.10 py311d: python3.11-dbg py311: python3.11 + py312: python3.12 + py312d: python3.12-dbg pypy: pypy pypy3: pypy3 @@ -72,5 +74,5 @@ commands= # asan/tsan: tell pytest not to capture output - else it is not possible to see # reports from sanitizers because they crash tested process on error. # likewise for python debug builds. - asan,tsan,py{27,39,310,311}d: -s \ + asan,tsan,py{27,39,310,311,312}d: -s \ gpython/ golang/ diff --git a/trun b/trun index d9d260d..727e063 100755 --- a/trun +++ b/trun @@ -1,5 +1,5 @@ #!/usr/bin/env python -# Copyright (C) 2019-2020 Nexedi SA and Contributors. +# Copyright (C) 2019-2024 Nexedi SA and Contributors. # Kirill Smelkov # # This program is free software: you can Use, Study, Modify and Redistribute @@ -34,12 +34,13 @@ trun cares to run python with LD_PRELOAD set appropriately to /path/to/libtsan.s from __future__ import print_function, absolute_import -import os, sys, re, subprocess, pkgutil -import warnings -with warnings.catch_warnings(): - warnings.simplefilter('ignore', DeprecationWarning) - import imp +import os, sys, re, subprocess, types PY3 = (bytes is not str) +if PY3: + from importlib import machinery as imp_machinery +else: + import imp, pkgutil + # env_prepend prepends value to ${name} environment variable. # @@ -64,12 +65,15 @@ def grep1(pattern, text): # -> re.Match|None # to import e.g. golang.pyx.build, or locate golang._golang, without built/working golang. 
def ximport_empty_golangmod(): assert 'golang' not in sys.modules - golang = imp.new_module('golang') + golang = types.ModuleType('golang') golang.__package__ = 'golang' golang.__path__ = ['golang'] golang.__file__ = 'golang/__init__.py' - golang.__loader__ = pkgutil.ImpLoader('golang', None, 'golang/__init__.py', - [None, None, imp.PY_SOURCE]) + if PY3: + golang.__loader__ = imp_machinery.SourceFileLoader('golang', 'golang/__init__.py') + else: + golang.__loader__ = pkgutil.ImpLoader('golang', None, 'golang/__init__.py', + [None, None, imp.PY_SOURCE]) sys.modules['golang'] = golang From 4d64fd0f85ee4e177f83e9ebadd112e360ec6e9d Mon Sep 17 00:00:00 2001 From: Kirill Smelkov Date: Mon, 6 May 2024 11:51:31 +0300 Subject: [PATCH 22/29] X update (sync with master + ustr.translate fixes) --- .gitmodules | 3 + .lsan-ignore.txt | 124 +++++++++ 3rdparty/ratas | 1 + MANIFEST.in | 2 +- conftest.py | 30 ++ golang/_golang.pyx | 20 +- golang/_golang_str.pyx | 20 +- golang/_golang_test.pyx | 21 +- golang/golang_str_test.py | 22 +- golang/golang_test.py | 10 +- golang/libgolang.h | 9 +- golang/pyx/build_test.py | 11 +- golang/runtime/_libgolang.pxd | 4 +- golang/runtime/_runtime_gevent.pyx | 20 +- golang/runtime/_runtime_thread.pyx | 55 +++- golang/runtime/libgolang.cpp | 12 +- golang/time.cpp | 422 ++++++++++++++++++++++++----- golang/time.h | 14 +- golang/time_test.py | 79 ++++-- setup.py | 12 +- tox.ini | 19 +- trun | 55 +++- 22 files changed, 794 insertions(+), 171 deletions(-) create mode 100644 .lsan-ignore.txt create mode 160000 3rdparty/ratas diff --git a/.gitmodules b/.gitmodules index c279e31..0be964e 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ +[submodule "3rdparty/ratas"] + path = 3rdparty/ratas + url = https://github.com/jsnell/ratas.git [submodule "3rdparty/funchook"] path = 3rdparty/funchook url = https://github.com/kubo/funchook.git diff --git a/.lsan-ignore.txt b/.lsan-ignore.txt new file mode 100644 index 0000000..5e46ba3 --- /dev/null +++ 
b/.lsan-ignore.txt @@ -0,0 +1,124 @@ +# .lsan-ignore.txt lists memory leak events that LeakSanitizer should not +# report when running pygolang tests. +# +# Many python allocations, whose lifetime coincides with python interpreter +# lifetime, and which are not explicitly freed on python shutdown, are +# reported as leaks by default. Disable leak reporting for those to avoid +# non-pygolang related printouts. + + +# >>> Everything created when initializing python, e.g. sys.stderr +# #0 0x7f21e74f3bd7 in malloc .../asan_malloc_linux.cpp:69 +# #1 0x555f361ff9a4 in PyThread_allocate_lock Python/thread_pthread.h:385 +# #2 0x555f3623f72a in _buffered_init Modules/_io/bufferedio.c:725 +# #3 0x555f3623ff7e in _io_BufferedWriter___init___impl Modules/_io/bufferedio.c:1803 +# #4 0x555f3623ff7e in _io_BufferedWriter___init__ Modules/_io/clinic/bufferedio.c.h:489 +# #5 0x555f3610c086 in type_call Objects/typeobject.c:1103 +# #6 0x555f3609cdcc in _PyObject_MakeTpCall Objects/call.c:214 +# #7 0x555f3609d6a8 in _PyObject_VectorcallTstate Include/internal/pycore_call.h:90 +# #8 0x555f3609d6a8 in _PyObject_VectorcallTstate Include/internal/pycore_call.h:77 +# #9 0x555f3609d6a8 in _PyObject_CallFunctionVa Objects/call.c:536 +# #10 0x555f3609e89c in _PyObject_CallFunction_SizeT Objects/call.c:590 +# #11 0x555f3623a0df in _io_open_impl Modules/_io/_iomodule.c:407 +# #12 0x555f3623a0df in _io_open Modules/_io/clinic/_iomodule.c.h:264 +# #13 0x555f360f17da in cfunction_vectorcall_FASTCALL_KEYWORDS Objects/methodobject.c:443 +# #14 0x555f3609d54c in _PyObject_VectorcallTstate Include/internal/pycore_call.h:92 +# #15 0x555f3609d54c in _PyObject_CallFunctionVa Objects/call.c:536 +# #16 0x555f3609ec34 in callmethod Objects/call.c:608 +# #17 0x555f3609ec34 in _PyObject_CallMethod Objects/call.c:677 +# #18 0x555f361e60cf in create_stdio Python/pylifecycle.c:2244 +# #19 0x555f361e6523 in init_sys_streams Python/pylifecycle.c:2431 +# #20 0x555f361e6523 in init_interp_main 
Python/pylifecycle.c:1154 +# #21 0x555f361e7204 in pyinit_main Python/pylifecycle.c:1230 +# #22 0x555f361e85ba in Py_InitializeFromConfig Python/pylifecycle.c:1261 +# #23 0x555f3621010a in pymain_init Modules/main.c:67 +# #24 0x555f362113de in pymain_main Modules/main.c:701 +# #25 0x555f362113de in Py_BytesMain Modules/main.c:734 +leak:^pymain_init$ + +# >>> Everything created when importing py modules, e.g. +# #0 0x7f18c86f3bd7 in malloc .../asan_malloc_linux.cpp:69 +# #1 0x55b971430acf in PyMem_RawMalloc Objects/obmalloc.c:586 +# #2 0x55b971430acf in _PyObject_Malloc Objects/obmalloc.c:2003 +# #3 0x55b971430acf in _PyObject_Malloc Objects/obmalloc.c:1996 +# #4 0x55b971415696 in new_keys_object Objects/dictobject.c:632 +# #5 0x55b971415716 in dictresize Objects/dictobject.c:1429 +# #6 0x55b97141961a in insertion_resize Objects/dictobject.c:1183 +# #7 0x55b97141961a in insertdict Objects/dictobject.c:1248 +# #8 0x55b97143eb7b in add_subclass Objects/typeobject.c:6547 +# #9 0x55b97144ca52 in type_ready_add_subclasses Objects/typeobject.c:6345 +# #10 0x55b97144ca52 in type_ready Objects/typeobject.c:6476 +# #11 0x55b971451a1f in PyType_Ready Objects/typeobject.c:6508 +# #12 0x55b971451a1f in type_new_impl Objects/typeobject.c:3189 +# #13 0x55b971451a1f in type_new Objects/typeobject.c:3323 +# #14 0x55b971443014 in type_call Objects/typeobject.c:1091 +# #15 0x55b9713d3dcc in _PyObject_MakeTpCall Objects/call.c:214 +# #16 0x55b9713d47bd in _PyObject_FastCallDictTstate Objects/call.c:141 +# #17 0x55b9713d47bd in PyObject_VectorcallDict Objects/call.c:165 +# #18 0x55b9714d14c2 in builtin___build_class__ Python/bltinmodule.c:209 +# #19 0x55b9714287da in cfunction_vectorcall_FASTCALL_KEYWORDS Objects/methodobject.c:443 +# #20 0x55b9713d4a7b in _PyObject_VectorcallTstate Include/internal/pycore_call.h:92 +# #21 0x55b9713d4a7b in PyObject_Vectorcall Objects/call.c:299 +# #22 0x55b97137666e in _PyEval_EvalFrameDefault Python/ceval.c:4769 +# #23 0x55b9714d7e6b in 
_PyEval_EvalFrame Include/internal/pycore_ceval.h:73 +# #24 0x55b9714d7e6b in _PyEval_Vector Python/ceval.c:6434 +# #25 0x55b9714d7e6b in PyEval_EvalCode Python/ceval.c:1148 +# #26 0x55b9714d2e1f in builtin_exec_impl Python/bltinmodule.c:1077 +# #27 0x55b9714d2e1f in builtin_exec Python/clinic/bltinmodule.c.h:465 +# #28 0x55b9714287da in cfunction_vectorcall_FASTCALL_KEYWORDS Objects/methodobject.c:443 +# #29 0x55b971376dcb in do_call_core Python/ceval.c:7349 +# #30 0x55b971376dcb in _PyEval_EvalFrameDefault Python/ceval.c:5376 +# #31 0x55b9714d7faf in _PyEval_EvalFrame Include/internal/pycore_ceval.h:73 +# #32 0x55b9714d7faf in _PyEval_Vector Python/ceval.c:6434 +# #33 0x55b9713d436e in _PyObject_VectorcallTstate Include/internal/pycore_call.h:92 +# #34 0x55b9713d436e in object_vacall Objects/call.c:819 +# #35 0x55b9713d63cf in PyObject_CallMethodObjArgs Objects/call.c:879 +# #36 0x55b9715080e1 in import_find_and_load Python/import.c:1748 +# #37 0x55b9715080e1 in PyImport_ImportModuleLevelObject Python/import.c:1847 +# #38 0x55b97137de9c in import_name Python/ceval.c:7422 +# #39 0x55b97137de9c in _PyEval_EvalFrameDefault Python/ceval.c:3946 +# #40 0x55b9714d7e6b in _PyEval_EvalFrame Include/internal/pycore_ceval.h:73 +# #41 0x55b9714d7e6b in _PyEval_Vector Python/ceval.c:6434 +# #42 0x55b9714d7e6b in PyEval_EvalCode Python/ceval.c:1148 +# #43 0x55b9714d2e1f in builtin_exec_impl Python/bltinmodule.c:1077 +# #44 0x55b9714d2e1f in builtin_exec Python/clinic/bltinmodule.c.h:465 +# #45 0x55b9714287da in cfunction_vectorcall_FASTCALL_KEYWORDS Objects/methodobject.c:443 +# #46 0x55b971376dcb in do_call_core Python/ceval.c:7349 +# #47 0x55b971376dcb in _PyEval_EvalFrameDefault Python/ceval.c:5376 +leak:^PyImport_Import +# importlib.import_module leads to +# #0 0x7f1951ef3bd7 in malloc ../../../../src/libsanitizer/asan/asan_malloc_linux.cpp:69 +# #1 0x55f399e8cacf in PyMem_RawMalloc Objects/obmalloc.c:586 +# #2 0x55f399e8cacf in _PyObject_Malloc Objects/obmalloc.c:2003 +# 
#3 0x55f399e8cacf in _PyObject_Malloc Objects/obmalloc.c:1996 +# #4 0x55f399e86344 in PyModule_ExecDef Objects/moduleobject.c:400 +# #5 0x55f399f6178a in exec_builtin_or_dynamic Python/import.c:2345 +# #6 0x55f399f6178a in _imp_exec_dynamic_impl Python/import.c:2419 +# #7 0x55f399f6178a in _imp_exec_dynamic Python/clinic/import.c.h:474 +# #8 0x55f399e8438a in cfunction_vectorcall_O Objects/methodobject.c:514 +leak:^_imp_exec_dynamic + + +# >>> Everything allocated at DSO initialization, e.g. +# #0 0x7f35d2af46c8 in operator new(unsigned long) .../asan_new_delete.cpp:95 +# #1 0x7f35ce897e9f in __static_initialization_and_destruction_0 golang/context.cpp:61 +# #2 0x7f35ce8982ef in _GLOBAL__sub_I_context.cpp golang/context.cpp:380 +# #3 0x7f35d32838bd in call_init elf/dl-init.c:90 +# #4 0x7f35d32838bd in call_init elf/dl-init.c:27 +# #5 0x7f35d32839a3 in _dl_init elf/dl-init.c:137 +# #6 0x7f35d256e023 in __GI__dl_catch_exception elf/dl-error-skeleton.c:182 +# #7 0x7f35d328a09d in dl_open_worker elf/dl-open.c:808 +# #8 0x7f35d256dfc9 in __GI__dl_catch_exception elf/dl-error-skeleton.c:208 +# #9 0x7f35d328a437 in _dl_open elf/dl-open.c:884 +# #10 0x7f35d24a4437 in dlopen_doit dlfcn/dlopen.c:56 +# #11 0x7f35d256dfc9 in __GI__dl_catch_exception elf/dl-error-skeleton.c:208 +# #12 0x7f35d256e07e in __GI__dl_catch_error elf/dl-error-skeleton.c:227 +# #13 0x7f35d24a3f26 in _dlerror_run dlfcn/dlerror.c:138 +# #14 0x7f35d24a44e8 in dlopen_implementation dlfcn/dlopen.c:71 +# #15 0x7f35d24a44e8 in ___dlopen dlfcn/dlopen.c:81 +# #16 0x7f35d2a77ff9 in dlopen .../sanitizer_common_interceptors.inc:6341 +leak:^_GLOBAL_ + + +# global<> does not deallocate its reference on purpose +leak:^_test_global()$ diff --git a/3rdparty/ratas b/3rdparty/ratas new file mode 160000 index 0000000..becd5fc --- /dev/null +++ b/3rdparty/ratas @@ -0,0 +1 @@ +Subproject commit becd5fc5c1e9ea600cd8b3b1c24d564794fedac4 diff --git a/MANIFEST.in b/MANIFEST.in index e2cae70..17a041e 100644 --- a/MANIFEST.in +++ 
b/MANIFEST.in @@ -1,4 +1,4 @@ -include COPYING README.rst CHANGELOG.rst tox.ini pyproject.toml trun .nxdtest +include COPYING README.rst CHANGELOG.rst tox.ini pyproject.toml trun .lsan-ignore.txt .nxdtest conftest.py include golang/libgolang.h include golang/runtime/libgolang.cpp include golang/runtime/libpyxruntime.cpp diff --git a/conftest.py b/conftest.py index 1ca5c1b..1f37bdd 100644 --- a/conftest.py +++ b/conftest.py @@ -1,3 +1,33 @@ +# pygolang | pytest config +# Copyright (C) 2021-2024 Nexedi SA and Contributors. +# Kirill Smelkov +# +# This program is free software: you can Use, Study, Modify and Redistribute +# it under the terms of the GNU General Public License version 3, or (at your +# option) any later version, as published by the Free Software Foundation. +# +# You can also Link and Combine this program with other software covered by +# the terms of any of the Free Software licenses or any of the Open Source +# Initiative approved licenses and Convey the resulting work. Corresponding +# source of such a combination shall include the source code for all other +# software used. +# +# This program is distributed WITHOUT ANY WARRANTY; without even the implied +# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# See COPYING file for full licensing terms. +# See https://www.nexedi.com/licensing for rationale and options. + +from __future__ import print_function, absolute_import + +import gc + + +# Do full GC before pytest exits, to avoid false positives in the leak detector. +def pytest_unconfigure(): + gc.collect() + + # ignore tests in distorm - else it breaks as e.g. 
# # 3rdparty/funchook/distorm/python/test_distorm3.py:15: in diff --git a/golang/_golang.pyx b/golang/_golang.pyx index 689d6a1..2a0ba56 100644 --- a/golang/_golang.pyx +++ b/golang/_golang.pyx @@ -173,11 +173,18 @@ cdef void __goviac(void *arg) nogil: # ---- channels ---- +# _frompyx indicates that a constructor is called from pyx code +cdef object _frompyx = object() + @final cdef class pychan: def __cinit__(pychan pych, size=0, dtype=object): - pych.dtype = parse_dtype(dtype) - pych._ch = _makechan_pyexc(dtypeRegistry[pych.dtype].size, size) + if dtype is _frompyx: + pych.dtype = DTYPE_STRUCTZ # anything + pych._ch = NULL + else: + pych.dtype = parse_dtype(dtype) + pych._ch = _makechan_pyexc(dtypeRegistry[pych.dtype].size, size) # pychan.nil(X) creates new nil pychan with specified dtype. # TODO try to avoid exposing .nil on pychan instances, and expose only pychan.nil @@ -370,7 +377,7 @@ cdef void pychan_asserttype(pychan pych, DType dtype) nogil: panic("pychan: channel type mismatch") cdef pychan pychan_from_raw(_chan *_ch, DType dtype): - cdef pychan pych = pychan.__new__(pychan) + cdef pychan pych = pychan.__new__(pychan, dtype=_frompyx) pych.dtype = dtype pych._ch = _ch; _chanxincref(_ch) return pych @@ -626,9 +633,7 @@ cdef object c_to_py(DType dtype, const chanElemBuf *cfrom): # mkpynil creates pychan instance that represents nil[dtype]. 
cdef PyObject *mkpynil(DType dtype): - cdef pychan pynil = pychan.__new__(pychan) - pynil.dtype = dtype - pynil._ch = NULL # should be already NULL + cdef pychan pynil = pychan_from_raw(NULL, dtype) Py_INCREF(pynil) return pynil @@ -818,9 +823,6 @@ from libcpp.typeinfo cimport type_info from cython.operator cimport typeid from libc.string cimport strcmp -# _frompyx indicates that a constructor is called from pyx code -cdef object _frompyx = object() - cdef class pyerror(Exception): # pyerror <- error @staticmethod diff --git a/golang/_golang_str.pyx b/golang/_golang_str.pyx index 3e4a64f..6172711 100644 --- a/golang/_golang_str.pyx +++ b/golang/_golang_str.pyx @@ -580,6 +580,7 @@ cdef class _pybstr(bytes): # https://github.com/cython/cython/issues/711 def title(self): return pyb(pyu(self).title()) def translate(self, table, delete=None): # bytes mode (compatibility with str/py2) + # XXX isinstance(zbytes) -> isinstance(bytes) ? if table is None or isinstance(table, zbytes) or delete is not None: if delete is None: delete = b'' return pyb(zbytes.translate(self, table, delete)) @@ -905,12 +906,7 @@ cdef class _pyustr(unicode): def translate(self, table): # unicode.translate does not accept bstr values - t = {} - for k,v in table.items(): - if not isinstance(v, int): # either unicode ordinal, - v = _xpyu_coerce(v) # character or None - t[k] = v - return pyu(zunicode.translate(self, t)) + return pyu(zunicode.translate(self, _pyustrTranslateTab(table))) def upper(self): return pyu(zunicode.upper(self)) def zfill(self, width): return pyu(zunicode.zfill(self, width)) @@ -983,6 +979,18 @@ cdef class _pyustrIter: x = next(self.uiter) return pyu(x) +# _pyustrTranslateTab wraps table for .translate to return bstr as unicode +# because unicode.translate does not accept bstr values. 
+cdef class _pyustrTranslateTab: + cdef object tab + def __init__(self, tab): + self.tab = tab + def __getitem__(self, k): + v = self.tab[k] + if not isinstance(v, int): # either unicode ordinal, + v = _xpyu_coerce(v) # character or None + return v + # _bdata/_udata retrieve raw data from bytes/unicode. def _bdata(obj): # -> bytes diff --git a/golang/_golang_test.pyx b/golang/_golang_test.pyx index d029ce0..3c9f60e 100644 --- a/golang/_golang_test.pyx +++ b/golang/_golang_test.pyx @@ -2,7 +2,7 @@ # cython: language_level=2 # distutils: language=c++ # -# Copyright (C) 2018-2020 Nexedi SA and Contributors. +# Copyright (C) 2018-2024 Nexedi SA and Contributors. # Kirill Smelkov # # This program is free software: you can Use, Study, Modify and Redistribute @@ -344,6 +344,25 @@ cdef nogil: pych.chan_double().close() +# verify that pychan_from_raw is not leaking C channel. +def test_pychan_from_raw_noleak(): + # pychan_from_raw used to create another channel and leak it + # + # this test _implicitly_ verifies that it is no longer the case - if it is, + # LSAN will report a memory leak after running the test. + # + # TODO consider adding explicit verification effective even under regular + # builds. Possible options: + # + # * verify malloc totals before and after tested code + # see e.g. https://stackoverflow.com/q/1761125/9456786 + # * hook _makechan and verify that it is not invoked from under + # pychan_from_raw. Depends on funchook integration. 
+ cdef chan[int] ch = makechan[int]() + cdef pychan pych = pychan.from_chan_int(ch) # uses pychan_from_raw internally + # pych and ch are freed automatically + + # ---- benchmarks ---- # bench_go_nogil mirrors golang_test.py:bench_go diff --git a/golang/golang_str_test.py b/golang/golang_str_test.py index 975584a..6f88ad5 100644 --- a/golang/golang_str_test.py +++ b/golang/golang_str_test.py @@ -1567,7 +1567,10 @@ def test_strings_methods(): # checkop verifies that `s.meth(*argv, **kw)` gives the same result for s, # argv and kw being various combinations of unicode,bstr,ustr, bytes/bytearray. def checkop(s, meth, *argv, **kw): - assert type(s) is str + if six.PY3: + assert type(s) is str + else: + assert type(s) in (str, unicode) # some tests use unicode because \u does not work in str literals ok = kw.pop('ok') if six.PY2: ok = deepReplaceStr(ok, xunicode) @@ -1738,7 +1741,7 @@ def _(*argv, **kw): _("123").isnumeric( ok=True) _("0x123").isnumeric( ok=False) _("мир").isprintable( ok=True, optional=True) # py3.0 - _("\u2009").isspace( ok=x32(True,False)) # thin space + _(u"\u2009").isspace( ok=True) # thin space _(" ").isspace( ok=True) _("мир").isspace( ok=False) _("мир").istitle( ok=False) @@ -1748,8 +1751,8 @@ def _(*argv, **kw): _("мир").ljust(10, ok="мир ") _("мир").ljust(10, 'ж', ok="миржжжжжжж") _("МиР").lower( ok="мир") - _("\u2009 мир").lstrip( ok=x32("мир", "\u2009 мир")) - _("\u2009 мир\u2009 ").lstrip( ok=x32("мир\u2009 ", "\u2009 мир\u2009 ")) + _(u"\u2009 мир").lstrip( ok="мир") + _(u"\u2009 мир\u2009 ").lstrip( ok=u"мир\u2009 ") _("мммир").lstrip('ми', ok="р") _("миру мир").partition('ру', ok=("ми", "ру", " мир")) _("миру мир").partition('ж', ok=("миру мир", "", "")) @@ -1764,15 +1767,15 @@ def _(*argv, **kw): _("миру мир").rpartition('ж', ok=("", "", "миру мир")) _("мир").rsplit( ok=["мир"]) _("привет мир").rsplit( ok=["привет", "мир"]) - _("привет\u2009мир").rsplit( ok=x32(["привет", "мир"], ["привет\u2009мир"])) + _(u"привет\u2009мир").rsplit( 
ok=["привет", "мир"]) _("привет мир").rsplit("и", ok=["пр", "вет м", "р"]) _("привет мир").rsplit("и", 1, ok=["привет м", "р"]) - _("мир \u2009").rstrip( ok=x32("мир", "мир \u2009")) - _(" мир \u2009").rstrip( ok=x32(" мир", " мир \u2009")) + _(u"мир \u2009").rstrip( ok="мир") + _(u" мир \u2009").rstrip( ok=" мир") _("мируу").rstrip('ру', ok="ми") _("мир").split( ok=["мир"]) _("привет мир").split( ok=["привет", "мир"]) - _("привет\u2009мир").split( ok=x32(['привет', 'мир'], ["привет\u2009мир"])) + _(u"привет\u2009мир").split( ok=['привет', 'мир']) _("привет мир").split("и", ok=["пр", "вет м", "р"]) _("привет мир").split("и", 1, ok=["пр", "вет мир"]) _("мир").splitlines( ok=["мир"]) @@ -1782,11 +1785,12 @@ def _(*argv, **kw): _("мир\nтруд\nмай\n").splitlines( ok=["мир", "труд", "май"]) _("мир\nтруд\nмай\n").splitlines(True, ok=["мир\n", "труд\n", "май\n"]) # startswith - tested in test_strings_index - _("\u2009 мир \u2009").strip( ok=x32("мир", "\u2009 мир \u2009")) + _(u"\u2009 мир \u2009").strip( ok="мир") _("миру мир").strip('мир', ok="у ") _("МиР").swapcase( ok="мИр") _("МиР").title( ok="Мир") _("мир").translate({ord(u'м'):ord(u'и'), ord(u'и'):'я', ord(u'р'):None}, ok="ия") + _(u"\u0000\u0001\u0002.").translate([u'м', ord(u'и'), None], ok="ми.") _("МиР").upper( ok="МИР") _("мир").zfill(10, ok="0000000мир") _("123").zfill(10, ok="0000000123") diff --git a/golang/golang_test.py b/golang/golang_test.py index 46d6e95..ce9bd0f 100644 --- a/golang/golang_test.py +++ b/golang/golang_test.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright (C) 2018-2023 Nexedi SA and Contributors. +# Copyright (C) 2018-2024 Nexedi SA and Contributors. # Kirill Smelkov # # This program is free software: you can Use, Study, Modify and Redistribute @@ -74,7 +74,8 @@ def _(b, func=getattr(mod, f)): # leaked goroutine behaviour check: done in separate process because we need # to test process termination exit there. 
def test_go_leaked(): - pyrun([dir_testprog + "/golang_test_goleaked.py"]) + pyrun([dir_testprog + "/golang_test_goleaked.py"], + lsan=False) # there are on-purpose leaks in this test # benchmark go+join a thread/coroutine. # pyx/nogil mirror is in _golang_test.pyx @@ -1756,6 +1757,11 @@ def _pyrun(argv, stdin=None, stdout=None, stderr=None, **kw): # -> retcode, st assert len(enc) == 1 env['PYTHONIOENCODING'] = enc.pop() + # disable LeakSanitizer if requested, e.g. when test is known to leak something on purpose + lsan = kw.pop('lsan', True) + if not lsan: + env['ASAN_OPTIONS'] = env.get('ASAN_OPTIONS', '') + ',detect_leaks=0' + p = Popen(argv, stdin=(PIPE if stdin else None), stdout=stdout, stderr=stderr, env=env, **kw) stdout, stderr = p.communicate(stdin) diff --git a/golang/libgolang.h b/golang/libgolang.h index 4131a84..b606bc5 100644 --- a/golang/libgolang.h +++ b/golang/libgolang.h @@ -345,8 +345,13 @@ typedef struct _libgolang_runtime_ops { // previously successfully allocated via sema_alloc. void (*sema_free) (_libgolang_sema*); - // sema_acquire/sema_release should acquire/release live semaphore allocated via sema_alloc. - void (*sema_acquire)(_libgolang_sema*); + // sema_acquire should try to acquire live semaphore allocated via sema_alloc during given time. + // it returns whether acquisition succeeded or timed out. + // the timeout is specified in nanoseconds. + // UINT64_MAX means no timeout. + bool (*sema_acquire)(_libgolang_sema*, uint64_t timeout_ns); + + // sema_release should release live semaphore allocated via sema_alloc. void (*sema_release)(_libgolang_sema*); // nanosleep should pause current goroutine for at least dt nanoseconds. diff --git a/golang/pyx/build_test.py b/golang/pyx/build_test.py index af03136..503aa17 100644 --- a/golang/pyx/build_test.py +++ b/golang/pyx/build_test.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright (C) 2019 Nexedi SA and Contributors. 
-# Kirill Smelkov +# Copyright (C) 2019-2024 Nexedi SA and Contributors. +# Kirill Smelkov # # This program is free software: you can Use, Study, Modify and Redistribute # it under the terms of the GNU General Public License version 3, or (at your @@ -28,7 +28,8 @@ # verify that we can build/run external package that uses pygolang in pyx mode. def test_pyx_build(): pyxuser = testprog + "/golang_pyx_user" - pyrun(["setup.py", "build_ext", "-i"], cwd=pyxuser) + pyrun(["setup.py", "build_ext", "-i"], cwd=pyxuser, + lsan=False) # gcc leaks # run built test. _ = pyout(["-c", @@ -44,8 +45,8 @@ def test_pyx_build(): # verify that we can build/run external dso that uses libgolang. def test_dso_build(): dsouser = testprog + "/golang_dso_user" - pyrun(["setup.py", "build_dso", "-i"], cwd=dsouser) - pyrun(["setup.py", "build_ext", "-i"], cwd=dsouser) + pyrun(["setup.py", "build_dso", "-i"], cwd=dsouser, lsan=False) # gcc leaks + pyrun(["setup.py", "build_ext", "-i"], cwd=dsouser, lsan=False) # gcc leaks # run built test. _ = pyout(["-c", diff --git a/golang/runtime/_libgolang.pxd b/golang/runtime/_libgolang.pxd index 958395a..edbc5eb 100644 --- a/golang/runtime/_libgolang.pxd +++ b/golang/runtime/_libgolang.pxd @@ -1,5 +1,5 @@ # cython: language_level=2 -# Copyright (C) 2019-2022 Nexedi SA and Contributors. +# Copyright (C) 2019-2024 Nexedi SA and Contributors. 
# Kirill Smelkov # # This program is free software: you can Use, Study, Modify and Redistribute @@ -36,7 +36,7 @@ cdef extern from "golang/libgolang.h" namespace "golang" nogil: _libgolang_sema* (*sema_alloc) () void (*sema_free) (_libgolang_sema*) - void (*sema_acquire)(_libgolang_sema*) + bint (*sema_acquire)(_libgolang_sema*, uint64_t timeout_ns) void (*sema_release)(_libgolang_sema*) void (*nanosleep)(uint64_t) diff --git a/golang/runtime/_runtime_gevent.pyx b/golang/runtime/_runtime_gevent.pyx index dcf4f33..b05ae68 100644 --- a/golang/runtime/_runtime_gevent.pyx +++ b/golang/runtime/_runtime_gevent.pyx @@ -1,5 +1,5 @@ # cython: language_level=2 -# Copyright (C) 2019-2023 Nexedi SA and Contributors. +# Copyright (C) 2019-2024 Nexedi SA and Contributors. # Kirill Smelkov # # This program is free software: you can Use, Study, Modify and Redistribute @@ -40,7 +40,10 @@ ELSE: from gevent import sleep as pygsleep -from libc.stdint cimport uint64_t +from libc.stdint cimport uint8_t, uint64_t, UINT64_MAX +cdef extern from *: + ctypedef bint cbool "bool" + from cpython cimport PyObject, Py_INCREF, Py_DECREF from cython cimport final @@ -95,9 +98,12 @@ cdef: Py_DECREF(pygsema) return True - bint _sema_acquire(_libgolang_sema *gsema): + bint _sema_acquire(_libgolang_sema *gsema, uint64_t timeout_ns, cbool* pacq): pygsema = gsema - pygsema.acquire() + timeout = None + if timeout_ns != UINT64_MAX: + timeout = float(timeout_ns) * 1e-9 + pacq[0] = pygsema.acquire(timeout=timeout) return True bint _sema_release(_libgolang_sema *gsema): @@ -142,14 +148,16 @@ cdef nogil: if not ok: panic("pyxgo: gevent: sema: free: failed") - void sema_acquire(_libgolang_sema *gsema): + cbool sema_acquire(_libgolang_sema *gsema, uint64_t timeout_ns): cdef PyExc exc + cdef cbool acq with gil: pyexc_fetch(&exc) - ok = _sema_acquire(gsema) + ok = _sema_acquire(gsema, timeout_ns, &acq) pyexc_restore(exc) if not ok: panic("pyxgo: gevent: sema: acquire: failed") + return acq void 
sema_release(_libgolang_sema *gsema): cdef PyExc exc diff --git a/golang/runtime/_runtime_thread.pyx b/golang/runtime/_runtime_thread.pyx index 288de3d..4325e89 100644 --- a/golang/runtime/_runtime_thread.pyx +++ b/golang/runtime/_runtime_thread.pyx @@ -1,5 +1,5 @@ # cython: language_level=2 -# Copyright (C) 2019-2022 Nexedi SA and Contributors. +# Copyright (C) 2019-2024 Nexedi SA and Contributors. # Kirill Smelkov # # This program is free software: you can Use, Study, Modify and Redistribute @@ -35,7 +35,12 @@ from __future__ import print_function, absolute_import # # NOTE Cython declares PyThread_acquire_lock/PyThread_release_lock as nogil from cpython.pythread cimport PyThread_acquire_lock, PyThread_release_lock, \ - PyThread_type_lock, WAIT_LOCK + PyThread_type_lock, WAIT_LOCK, NOWAIT_LOCK, PyLockStatus, PY_LOCK_ACQUIRED, PY_LOCK_FAILURE + +cdef extern from * nogil: + ctypedef int PY_TIMEOUT_T # long long there + PyLockStatus PyThread_acquire_lock_timed(PyThread_type_lock, PY_TIMEOUT_T timeout_us, int intr_flag) + # NOTE On Darwin, even though this is considered as POSIX, Python uses # mutex+condition variable to implement its lock, and, as of 20190828, Py2.7 @@ -98,6 +103,9 @@ from libc.errno cimport errno, EINTR, EBADF from posix.fcntl cimport mode_t from posix.stat cimport struct_stat from posix.strings cimport bzero +cdef extern from *: + ctypedef bint cbool "bool" + IF POSIX: from posix.time cimport clock_gettime, nanosleep as posix_nanosleep, timespec, CLOCK_REALTIME ELSE: @@ -138,11 +146,46 @@ cdef nogil: pysema = gsema PyThread_free_lock(pysema) - void sema_acquire(_libgolang_sema *gsema): + cbool sema_acquire(_libgolang_sema *gsema, uint64_t timeout_ns): pysema = gsema - ok = PyThread_acquire_lock(pysema, WAIT_LOCK) - if ok == 0: - panic("pyxgo: thread: sema_acquire: PyThread_acquire_lock failed") + IF PY3: + cdef PY_TIMEOUT_T timeout_us + ELSE: + cdef uint64_t tprev, t, tsleep + if timeout_ns == UINT64_MAX: + ok = PyThread_acquire_lock(pysema, 
WAIT_LOCK) + if ok == 0: + panic("pyxgo: thread: sema_acquire: PyThread_acquire_lock failed") + return 1 + else: + IF PY3: + timeout_us = timeout_ns // 1000 + lkok = PyThread_acquire_lock_timed(pysema, timeout_us, 0) + if lkok == PY_LOCK_FAILURE: + return 0 + elif lkok == PY_LOCK_ACQUIRED: + return 1 + else: + panic("pyxgo: thread: sema_acquire: PyThread_acquire_lock_timed failed") + ELSE: + # py2 misses PyThread_acquire_lock_timed - provide fallback ourselves + tprev = nanotime() + while 1: + ok = PyThread_acquire_lock(pysema, NOWAIT_LOCK) + if ok: + return 1 + tsleep = min(timeout_ns, 50*1000) # poll every 50 μs = 20 Hz + if tsleep == 0: + break + nanosleep(tsleep) + t = nanotime() + if t < tprev: + break # clock skew + if t - tprev >= timeout_ns: + break + timeout_ns -= t - tprev + tprev = t + return 0 void sema_release(_libgolang_sema *gsema): pysema = gsema diff --git a/golang/runtime/libgolang.cpp b/golang/runtime/libgolang.cpp index f91772a..a6a288a 100644 --- a/golang/runtime/libgolang.cpp +++ b/golang/runtime/libgolang.cpp @@ -131,6 +131,7 @@ using internal::_runtime; namespace internal { namespace atomic { extern void _init(); } } namespace os { namespace signal { extern void _init(); } } +namespace time { extern void _init(); } void _libgolang_init(const _libgolang_runtime_ops *runtime_ops) { if (_runtime != nil) // XXX better check atomically panic("libgolang: double init"); @@ -138,6 +139,7 @@ void _libgolang_init(const _libgolang_runtime_ops *runtime_ops) { internal::atomic::_init(); os::signal::_init(); + time::_init(); } void _taskgo(void (*f)(void *), void *arg) { @@ -166,7 +168,15 @@ void _semafree(_sema *sema) { } void _semaacquire(_sema *sema) { - _runtime->sema_acquire((_libgolang_sema *)sema); + bool ok; + ok = _runtime->sema_acquire((_libgolang_sema *)sema, UINT64_MAX); + if (!ok) + panic("semaacquire: failed"); +} + +// NOTE not currently exposed in public API +bool _semaacquire_timed(_sema *sema, uint64_t timeout_ns) { + return 
_runtime->sema_acquire((_libgolang_sema *)sema, timeout_ns); } void _semarelease(_sema *sema) { diff --git a/golang/time.cpp b/golang/time.cpp index b644b4c..6e893f9 100644 --- a/golang/time.cpp +++ b/golang/time.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2019-2020 Nexedi SA and Contributors. +// Copyright (C) 2019-2024 Nexedi SA and Contributors. // Kirill Smelkov // // This program is free software: you can Use, Study, Modify and Redistribute @@ -21,8 +21,24 @@ // See time.h for package overview. #include "golang/time.h" +#include "timer-wheel.h" -#include + +#define DEBUG 0 +#if DEBUG +# define debugf(format, ...) fprintf(stderr, format, ##__VA_ARGS__) +#else +# define debugf(format, ...) do {} while (0) +#endif + + +// golang::sync:: (private imports) +namespace golang { +namespace sync { + +bool _semaacquire_timed(_sema *sema, uint64_t timeout_ns); + +}} // golang::sync:: // golang::time:: (except sleep and now) @@ -30,7 +46,6 @@ namespace golang { namespace time { // ---- timers ---- -// FIXME timers are implemented very inefficiently - each timer currently consumes a goroutine. 
Ticker new_ticker(double dt); Timer new_timer (double dt); @@ -51,7 +66,12 @@ Timer after_func(double dt, func f) { return _new_timer(dt, f); } -// Ticker +Timer new_timer(double dt) { + return _new_timer(dt, nil); +} + + +// Ticker (small wrapper around Timer) _Ticker::_Ticker() {} _Ticker::~_Ticker() {} void _Ticker::decref() { @@ -67,9 +87,7 @@ Ticker new_ticker(double dt) { tx->c = makechan(1); // 1-buffer -- same as in Go tx->_dt = dt; tx->_stop = false; - go([tx]() { - tx->_tick(); - }); + tx->_timer = after_func(dt, [tx]() { tx ->_tick(); }); return tx; } @@ -78,6 +96,10 @@ void _Ticker::stop() { tx._mu.lock(); tx._stop = true; + if (tx._timer != nil) { + tx._timer->stop(); + tx._timer = nil; // break Ticker -> Timer -> _tick -> Ticker cycle + } // drain what _tick could have been queued already while (tx.c.len() > 0) @@ -88,113 +110,379 @@ void _Ticker::stop() { void _Ticker::_tick() { _Ticker &tx = *this; - while (1) { - // XXX adjust for accumulated error δ? - sleep(tx._dt); - - tx._mu.lock(); - if (tx._stop) { - tx._mu.unlock(); - return; - } - - // send from under ._mu so that .stop can be sure there is no - // ongoing send while it drains the channel. - double t = now(); - select({ - _default, - tx.c.sends(&t), - }); + tx._mu.lock(); + if (tx._stop) { tx._mu.unlock(); + return; } + + // XXX adjust for accumulated error δ? + tx._timer->reset(tx._dt); + + // send from under ._mu so that .stop can be sure there is no + // ongoing send while it drains the channel. + double t = now(); + select({ + _default, + tx.c.sends(&t), + }); + tx._mu.unlock(); } -// Timer +// Timers +// +// Timers are implemented via Timer Wheel. +// For this time arrow is divided into equal periods named ticks, and Ratas +// library[1] is used to manage timers with granularity of ticks. We employ +// ticks to avoid unnecessary overhead of managing timeout-style timers with +// nanosecond precision. +// +// Let g denote tick granularity. 
+// +// The timers are provided with guaranty that their expiration happens after +// requested expiration time. In other words the following invariant is always true: +// +// t(exp) ≤ t(fire) +// +// we also want that firing _ideally_ happens not much far away from requested +// expiration time, meaning that the following property is aimed for, but not guaranteed: +// +// t(fire) < t(exp) + g +// +// a tick Ti is associated with [i-1,i)·g time range. It is said that tick Ti +// "happens" at i·g point in time. Firing of timers associated with tick Ti is +// done when Ti happens - ideally at i·g time or strictly speaking ≥ that point. +// +// When timers are armed their expiration tick is set as Texp = ⌊t(exp)/g+1⌋ to +// be in time range that tick Texp covers. +// +// +// A special goroutine, _timer_loop, is dedicated to advance time of the +// timer-wheel as ticks happen, and to run expired timers. When there is +// nothing to do that goroutine pauses itself and goes to sleep until either +// next expiration moment, or until new timer with earlier expiration time is +// armed. To be able to simultaneously select on those two condition a +// semaphore with acquisition timeout is employed. Please see _tSema for +// details. +// +// +// [1] Ratas - A hierarchical timer wheel. +// https://www.snellman.net/blog/archive/2016-07-27-ratas-hierarchical-timer-wheel, +// https://github.com/jsnell/ratas + +// Tns indicates time measured in nanoseconds. +// It is used for documentation purposes mainly to distinguish from the time measured in ticks. +typedef uint64_t Tns; + +// _tick_g is ticks granularity in nanoseconds. +static const Tns _tick_g = 1024; // 1 tick is ~ 1 μs + + +// timer-wheel holds registry of all timers and manages them. 
+static sync::Mutex* _tWheelMu; // lock for timer wheel + sleep/wakeup channel (see _tSema & co below)
+static TimerWheel* _tWheel; // for each timer the wheel holds 1 reference to _TimerImpl object
+
+// _TimerImpl amends _Timer with timer-wheel entry and implementation-specific state.
+enum _TimerState {
+ _TimerDisarmed, // timer is not registered to timer wheel and is not firing
+ _TimerArmed, // timer is registered to timer wheel and is not firing
+ _TimerFiring // timer is currently firing (and not on the timer wheel)
+};
+struct _TimerImpl : _Timer {
+ void _fire();
+ void _queue_fire();
+ MemberTimerEvent<_TimerImpl, &_TimerImpl::_queue_fire> _tWheelEntry;
+
+ func _f;
+
+ sync::Mutex _mu;
+ _TimerState _state;
+
+ // entry on "firing" list; see _tFiring for details
+ _TimerImpl* _tFiringNext; // TODO could reuse _tWheelEntry.{next_,prev_} for "firing" list
+
+ _TimerImpl();
+ ~_TimerImpl();
+};
+
+_TimerImpl::_TimerImpl() : _tWheelEntry(this) {}
+_TimerImpl::~_TimerImpl() {}
+
 _Timer::_Timer() {}
 _Timer::~_Timer() {}
 void _Timer::decref() {
 if (__decref())
- delete this;
+ delete static_cast<_TimerImpl*>(this);
+}
+
+
+// _tSema and _tSleeping + _tWaking organize sleep/wakeup channel.
+//
+// Timer loop uses wakeup sema to both:
+// * sleep until next timer expires, and
+// * become woken up earlier if new timer with earlier expiration time is armed
+//
+// _tSleeping + _tWaking are used by the timer loop and clients to coordinate
+// _tSema operations, so that the value of sema is always 0 or 1, and that
+// every new loop cycle starts with sema=0, meaning that sema.Acquire will block.
+//
+// Besides sema.Acquire, all operations on the sleep/wakeup channel are done under _tWheelMu.
+static sync::_sema* _tSema; +static bool _tSleeping; // 1 iff timer loop: + // \/ decided to go to sleep on wakeup sema + // \/ sleeps on wakeup sema via Acquire + // \/ woken up after Acquire before setting _tSleeping=0 back +static bool _tWaking; // 1 iff client timer arm: + // /\ saw _tSleeping=1 && _tWaking=0 and decided to do wakeup + // /\ (did Release \/ will do Release) + // /\ until timer loop set back _tWaking=0 +static Tns _tSleeping_until; // until when timer loop is sleeping if _tSleeping=1 + + +// _timer_loop implements timer loop: it runs in dedicated goroutine ticking the +// timer-wheel and sleeping in between ticks. +static void _timer_loop(); +static void _timer_loop_fire_queued(); +void _init() { + _tWheelMu = new sync::Mutex(); + _tWheel = new TimerWheel(_nanotime() / _tick_g); + _tSema = sync::_makesema(); sync::_semaacquire(_tSema); // 1 -> 0 + _tSleeping = false; + _tWaking = false; + _tSleeping_until = 0; + go(_timer_loop); +} + +static void _timer_loop() { + while (1) { + // tick the wheel. This puts expired timers on firing list but delays + // really firing them until we release _tWheelMu. + _tWheelMu->lock(); + Tick now_t = _nanotime() / _tick_g; + Tick wnow_t = _tWheel->now(); + Tick wdt_t = now_t - wnow_t; + debugf("LOOP: now_t: %lu wnow_t: %lu δ_t %lu ...\n", now_t, wnow_t, wdt_t); + if (now_t > wnow_t) // advance(0) panics. Avoid that if we wake up earlier + _tWheel->advance(wdt_t); // inside the same tick, e.g. due to signal. + _tWheelMu->unlock(); + + // fire the timers queued on the firing list + _timer_loop_fire_queued(); + + + // go to sleep until next timer expires or wakeup comes from new arming. + // + // limit max sleeping time because contrary to other wheel operations - + // - e.g. insert and delete which are O(1), the complexity of + // ticks_to_next_event is O(time till next expiry). 
+ Tns tsleep_max = 1*1E9; // 1s + bool sleeping = false; + + _tWheelMu->lock(); + Tick wsleep_t = _tWheel->ticks_to_next_event(tsleep_max / _tick_g); + Tick wnext_t = _tWheel->now() + wsleep_t; + + Tns tnext = wnext_t * _tick_g; + Tns tnow = _nanotime(); + + if (tnext > tnow) { + _tSleeping = sleeping = true; + _tSleeping_until = tnext; + } + _tWheelMu->unlock(); + + if (!sleeping) + continue; + + Tns tsleep = tnext - tnow; + debugf("LOOP: sleeping %.3f μs ...\n", tsleep / 1e3); + + bool acq = sync::_semaacquire_timed(_tSema, tsleep); + + // bring sleep/wakeup channel back into reset state with S=0 + _tWheelMu->lock(); + // acq ^ waking Release was done while Acquire was blocked S=0 + // acq ^ !waking impossible + // !acq ^ waking Acquire finished due to timeout; Release was done after that S=1 + // !acq ^ !waking Acquire finished due to timeout; no Release was done at all S=0 + + debugf("LOOP: woken up acq=%d waking=%d\n", acq, _tWaking); + + if ( acq && !_tWaking) { + _tWheelMu->unlock(); + panic("BUG: timer loop: woken up with acq ^ !waking"); + } + if (!acq && _tWaking) { + acq = sync::_semaacquire_timed(_tSema, 0); // S=1 -> acquire should be immediate + if (!acq) { + _tWheelMu->unlock(); + panic("BUG: timer loop: reacquire after acq ^ waking failed"); + } + } + + _tSleeping = false; + _tWaking = false; + _tSleeping_until = 0; + _tWheelMu->unlock(); + } } Timer _new_timer(double dt, func f) { - Timer t = adoptref(new _Timer()); - t->c = (f == nil ? makechan(1) : nil); - t->_f = f; - t->_dt = INFINITY; - t->_ver = 0; + _TimerImpl* _t = new _TimerImpl(); + + _t->c = (f == nil ? 
makechan(1) : nil); + _t->_f = f; + _t->_state = _TimerDisarmed; + _t->_tFiringNext = nil; + + Timer t = adoptref(static_cast<_Timer*>(_t)); t->reset(dt); return t; } -Timer new_timer(double dt) { - return _new_timer(dt, nil); +void _Timer::reset(double dt) { + _TimerImpl& t = *static_cast<_TimerImpl*>(this); + + if (dt <= 0) + dt = 0; + + Tns when = _nanotime() + Tns(dt*1e9); + Tick when_t = when / _tick_g + 1; // Ti covers [i-1,i)·g + + _tWheelMu->lock(); + t._mu.lock(); + if (t._state != _TimerDisarmed) { + t._mu.unlock(); + _tWheelMu->unlock(); + panic("Timer.reset: the timer is armed; must be stopped or expired"); + } + t._state = _TimerArmed; + + Tick wnow_t = _tWheel->now(); + Tick wdt_t; + if (when_t > wnow_t) + wdt_t = when_t - wnow_t; + else + wdt_t = 1; // schedule(0) panics + + // the wheel will keep a reference to the timer + t.incref(); + + _tWheel->schedule(&t._tWheelEntry, wdt_t); + t._mu.unlock(); + + // wakeup timer loop if it is sleeping until later than new timer expiry + if (_tSleeping) { + if ((when < _tSleeping_until) && !_tWaking) { + debugf("USER: waking up loop\n"); + _tWaking = true; + sync::_semarelease(_tSema); + } + } + + _tWheelMu->unlock(); } bool _Timer::stop() { - _Timer &t = *this; + _TimerImpl& t = *static_cast<_TimerImpl*>(this); bool canceled; + _tWheelMu->lock(); t._mu.lock(); - if (t._dt == INFINITY) { + switch (t._state) { + case _TimerDisarmed: canceled = false; - } - else { - t._dt = INFINITY; - t._ver += 1; + break; + + case _TimerArmed: + // timer wheel is holding this timer entry. Remove it from there. + t._tWheelEntry.cancel(); + t.decref(); + canceled = true; + break; + + case _TimerFiring: + // the timer is on "firing" list. Timer loop will process it and skip + // upon seeing ._state = _TimerDisarmed. It will also be the timer loop + // to drop the reference to the timer that timer-wheel was holding. 
canceled = true; + break; + + default: + panic("invalid timer state"); + } + if (canceled) + t._state = _TimerDisarmed; + // drain what _fire could have been queued already while (t.c.len() > 0) t.c.recv(); t._mu.unlock(); + _tWheelMu->unlock(); + return canceled; } -void _Timer::reset(double dt) { - _Timer &t = *this; +// when timers are fired by _tWheel.advance(), they are first popped from _tWheel and put on +// _tFiring list, so that the real firing could be done without holding _tWheelMu. +static _TimerImpl* _tFiring = nil; +static _TimerImpl* _tFiringLast = nil; + +void _TimerImpl::_queue_fire() { + _TimerImpl& t = *this; t._mu.lock(); - if (t._dt != INFINITY) { - t._mu.unlock(); - panic("Timer.reset: the timer is armed; must be stopped or expired"); - } - t._dt = dt; - t._ver += 1; - // TODO rework timers so that new timer does not spawn new goroutine. - Timer tref = newref(&t); // pass t reference to spawned goroutine - go([tref, dt](int ver) { - tref->_fire(dt, ver); - }, t._ver); + assert(t._state == _TimerArmed); + t._state = _TimerFiring; t._mu.unlock(); + + t._tFiringNext = nil; + if (_tFiring == nil) + _tFiring = &t; + if (_tFiringLast != nil) + _tFiringLast->_tFiringNext = &t; + _tFiringLast = &t; } -void _Timer::_fire(double dt, int ver) { - _Timer &t = *this; +static void _timer_loop_fire_queued() { + for (_TimerImpl* t = _tFiring; t != nil;) { + _TimerImpl* fnext = t->_tFiringNext; + t->_tFiringNext = nil; + t->_fire(); - sleep(dt); - t._mu.lock(); - if (t._ver != ver) { - t._mu.unlock(); - return; // the timer was stopped/resetted - don't fire it + t->decref(); // wheel was holding a reference to the timer + t = fnext; } - t._dt = INFINITY; + _tFiring = nil; + _tFiringLast = nil; +} - // send under ._mu so that .stop can be sure that if it sees - // ._dt = INFINITY, there is no ongoing .c send. 
- if (t._f == nil) { - t.c.send(now()); - t._mu.unlock(); - return; +void _TimerImpl::_fire() { + _TimerImpl& t = *this; + + bool fire = false; + t._mu.lock(); + if (t._state == _TimerFiring) { // stop could disarm the timer in the meantime + t._state = _TimerDisarmed; + fire = true; + + debugf("LOOP: firing @ %lu ...\n", t._tWheelEntry.scheduled_at()); + + // send under ._mu so that .stop can be sure that if it sees + // ._state = _TimerDisarmed, there is no ongoing .c send. + if (t._f == nil) + t.c.send(now()); } t._mu.unlock(); // call ._f not from under ._mu not to deadlock e.g. if ._f wants to reset the timer. - t._f(); + if (fire && t._f != nil) + t._f(); } }} // golang::time:: diff --git a/golang/time.h b/golang/time.h index 2e687f2..18e8b93 100644 --- a/golang/time.h +++ b/golang/time.h @@ -1,7 +1,7 @@ #ifndef _NXD_LIBGOLANG_TIME_H #define _NXD_LIBGOLANG_TIME_H -// Copyright (C) 2019-2023 Nexedi SA and Contributors. +// Copyright (C) 2019-2024 Nexedi SA and Contributors. // Kirill Smelkov // // This program is free software: you can Use, Study, Modify and Redistribute @@ -118,6 +118,7 @@ struct _Ticker : object { double _dt; sync::Mutex _mu; bool _stop; + Timer _timer; // don't new - create only via new_ticker() private: @@ -147,18 +148,12 @@ LIBGOLANG_API Timer new_timer(double dt); struct _Timer : object { chan c; -private: - func _f; - - sync::Mutex _mu; - double _dt; // +inf - stopped, otherwise - armed - int _ver; // current timer was armed by n'th reset - // don't new - create only via new_timer() & co private: _Timer(); ~_Timer(); friend Timer _new_timer(double dt, func f); + friend class _TimerImpl; public: LIBGOLANG_API void decref(); @@ -182,9 +177,6 @@ struct _Timer : object { // // the timer must be either already stopped or expired. 
LIBGOLANG_API void reset(double dt); - -private: - void _fire(double dt, int ver); }; diff --git a/golang/time_test.py b/golang/time_test.py index e72c597..056367a 100644 --- a/golang/time_test.py +++ b/golang/time_test.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright (C) 2019 Nexedi SA and Contributors. -# Kirill Smelkov +# Copyright (C) 2019-2024 Nexedi SA and Contributors. +# Kirill Smelkov # # This program is free software: you can Use, Study, Modify and Redistribute # it under the terms of the GNU General Public License version 3, or (at your @@ -20,9 +20,10 @@ from __future__ import print_function, absolute_import -from golang import select -from golang import time +from golang import select, func, defer +from golang import time, sync from golang.golang_test import panics +from six.moves import range as xrange # all timer tests operate in dt units dt = 10*time.millisecond @@ -65,6 +66,7 @@ def test_ticker_time(): # test_timer verifies that Timer/Ticker fire as expected. +@func def test_timer(): # start timers at x5, x7 and x11 intervals an verify that the timers fire # in expected sequence. The times when the timers fire do not overlap in @@ -73,15 +75,15 @@ def test_timer(): tv = [] # timer events Tstart = time.now() - t23 = time.Timer(23*dt) - t5 = time.Timer( 5*dt) + t23 = time.Timer(23*dt); defer(t23.stop) + t5 = time.Timer( 5*dt); defer(t5 .stop) def _(): tv.append(7) t7f.reset(7*dt) - t7f = time.Timer( 7*dt, f=_) + t7f = time.Timer( 7*dt, f=_); defer(t7f.stop) - tx11 = time.Ticker(11*dt) + tx11 = time.Ticker(11*dt); defer(tx11.stop) while 1: _, _rx = select( @@ -108,19 +110,20 @@ def _(): # test_timer_misc, similarly to test_timer, verifies misc timer convenience functions. 
+@func def test_timer_misc(): tv = [] Tstart = time.now() - c23 = time.after(23*dt) - c5 = time.after( 5*dt) + c23 = time.after(23*dt) # cannot stop + c5 = time.after( 5*dt) # cannot stop def _(): tv.append(7) t7f.reset(7*dt) - t7f = time.after_func(7*dt, _) + t7f = time.after_func(7*dt, _); defer(t7f.stop) - cx11 = time.tick(11*dt) + cx11 = time.tick(11*dt) # cannot stop while 1: _, _rx = select( @@ -148,13 +151,14 @@ def _(): # test_timer_stop verifies that .stop() cancels Timer or Ticker. +@func def test_timer_stop(): tv = [] - t10 = time.Timer (10*dt) - t2 = time.Timer ( 2*dt) # will fire and cancel t3, tx5 - t3 = time.Timer ( 3*dt) # will be canceled - tx5 = time.Ticker( 5*dt) # will be canceled + t10 = time.Timer (10*dt); defer(t10.stop) + t2 = time.Timer ( 2*dt); defer(t2 .stop) # will fire and cancel t3, tx5 + t3 = time.Timer ( 3*dt); defer(t3 .stop) # will be canceled + tx5 = time.Ticker( 5*dt); defer(tx5.stop) # will be canceled while 1: _, _rx = select( @@ -180,9 +184,10 @@ def test_timer_stop(): # test_timer_stop_drain verifies that Timer/Ticker .stop() drains timer channel. +@func def test_timer_stop_drain(): - t = time.Timer (1*dt) - tx = time.Ticker(1*dt) + t = time.Timer (1*dt); defer(t.stop) + tx = time.Ticker(1*dt); defer(tx.stop) time.sleep(2*dt) assert len(t.c) == 1 @@ -195,9 +200,45 @@ def test_timer_stop_drain(): assert len(tx.c) == 0 +# test_timer_stop_vs_func verifies that Timer .stop() works correctly with func-timer. +@func +def test_timer_stop_vs_func(): + tv = [] + def _1(): tv.append(1) + def _2(): tv.append(2) + + t1 = time.after_func(1e6*dt, _1); defer(t1.stop) + t2 = time.after_func( 1*dt, _2); defer(t2.stop) + + time.sleep(2*dt) + assert t1.stop() == True + assert t2.stop() == False + assert tv == [2] + + # test_timer_reset_armed verifies that .reset() panics if called on armed timer. 
+@func def test_timer_reset_armed(): # reset while armed - t = time.Timer(10*dt) + t = time.Timer(10*dt); defer(t.stop) with panics("Timer.reset: the timer is armed; must be stopped or expired"): t.reset(5*dt) + + +# bench_timer_arm_cancel benchmarks arming timers that do not fire. +# it shows how cheap or expensive it is to use timers to implement timeouts. +def bench_timer_arm_cancel(b): + for i in xrange(b.N): + t = time.Timer(10*time.second) + _ = t.stop() + assert _ is True + + +# bench_timer_arm_fire benchmarks arming timers that do fire. +# it shows what it costs to go through all steps related to timer loop and firing timers. +def bench_timer_arm_fire(b): + wg = sync.WaitGroup() + wg.add(b.N) + for i in xrange(b.N): + t = time.after_func(1*time.millisecond, wg.done) + wg.wait() diff --git a/setup.py b/setup.py index 9ef79dc..8d675b8 100644 --- a/setup.py +++ b/setup.py @@ -188,7 +188,7 @@ def install_egg_scripts(self, dist): # requirements of packages under "golang." namespace R = { - 'cmd.pybench': {'pytest', 'py'}, + 'cmd.pybench': {'pytest', 'py ; python_version >= "3"'}, 'pyx.build': {'setuptools', 'wheel', 'cython < 3', 'setuptools_dso >= 2.8'}, 'x.perf.benchlib': {'numpy'}, } @@ -467,8 +467,11 @@ def defif(name, ok): 'golang/os/signal.h', 'golang/strings.h', 'golang/sync.h', - 'golang/time.h'], - include_dirs = ['3rdparty/include'], + 'golang/time.h', + '3rdparty/ratas/src/timer-wheel.h'], + include_dirs = [ + '3rdparty/include', + '3rdparty/ratas/src'], define_macros = [('BUILDING_LIBGOLANG', None)], soversion = '0.1'), @@ -604,9 +607,6 @@ def defif(name, ok): Programming Language :: Python :: 2 Programming Language :: Python :: 2.7 Programming Language :: Python :: 3 - Programming Language :: Python :: 3.5 - Programming Language :: Python :: 3.6 - Programming Language :: Python :: 3.7 Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 Programming Language :: Python :: 3.10 diff --git a/tox.ini b/tox.ini index 
6833dce..dcd823d 100644 --- a/tox.ini +++ b/tox.ini @@ -1,6 +1,6 @@ [tox] envlist = - {py27d,py27,py37,py38,py39d,py39,py310d,py310,py311d,py311,py312,pypy,pypy3}-{thread,gevent} + {py27d,py27,py38,py39d,py39,py310d,py310,py311d,py311,py312,pypy,pypy3}-{thread,gevent} # ThreadSanitizer @@ -10,24 +10,23 @@ envlist = # (*) PyPy locks its GIL (see RPyGilAcquire) by manually doing atomic cmpxchg # and other games, which TSAN cannot see if PyPy itself was not compiled with # -fsanitize=thread. - {py27d,py27,py37,py38,py39d,py39,py310d,py310,py311d,py311,py312 }-{thread }-tsan + {py27d,py27,py38,py39d,py39,py310d,py310,py311d,py311,py312 }-{thread }-tsan # XXX py*-gevent-tsan would be nice to have, but at present TSAN is not # effective with gevent, because it does not understand greenlet "thread" # switching and so perceives the program as having only one thread where races # are impossible. Disabled to save time. -# {py27d,py27,py37,py38,py39d,py39,py310d,py310,py311d,py311,py312 }-{ gevent}-tsan +# {py27d,py27,py38,py39d,py39,py310d,py310,py311d,py311,py312 }-{ gevent}-tsan # AddressSanitizer # XXX asan does not work with gevent: https://github.com/python-greenlet/greenlet/issues/113 - {py27d,py27,py37,py38,py39d,py39,py310d,py310,py311d,py311,py312,pypy,pypy3}-{thread }-asan + {py27d,py27,py38,py39d,py39,py310d,py310,py311d,py311,py312,pypy,pypy3}-{thread }-asan [testenv] basepython = py27d: python2.7-dbg py27: python2.7 - py37: python3.7 py38: python3.8 py39d: python3.9-dbg py39: python3.9 @@ -43,16 +42,16 @@ basepython = setenv = # distutils take CFLAGS for both C and C++. # distutils use CFLAGS also at link stage -> we don't need to set LDFLAGS separately. 
- tsan: CFLAGS=-g -fsanitize=thread - asan: CFLAGS=-g -fsanitize=address + tsan: CFLAGS=-g -fsanitize=thread -fno-omit-frame-pointer + asan: CFLAGS=-g -fsanitize=address -fno-omit-frame-pointer # XXX however distutils' try_link, which is used by numpy.distutils use only CC # as linker without CFLAGS and _without_ LDFLAGS, which fails if *.o were # compiled with -fsanitize=X and linked without that option. Work it around # with also adjusting CC. # XXX better arrange to pass CFLAGS to pygolang only, e.g. by adding --race or # --sanitize=thread to `setup.py build_ext`. - tsan: CC=cc -fsanitize=thread - asan: CC=cc -fsanitize=address + tsan: CC=cc -fsanitize=thread -fno-omit-frame-pointer + asan: CC=cc -fsanitize=address -fno-omit-frame-pointer # always compile pygolang from source and don't reuse binary pygolang wheels as # we compile each case with different CFLAGS. @@ -76,3 +75,5 @@ commands= # likewise for python debug builds. asan,tsan,py{27,39,310,311,312}d: -s \ gpython/ golang/ + +allowlist_externals={toxinidir}/trun diff --git a/trun b/trun index 727e063..9d490fa 100755 --- a/trun +++ b/trun @@ -1,4 +1,5 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*- # Copyright (C) 2019-2024 Nexedi SA and Contributors. # Kirill Smelkov # @@ -34,7 +35,7 @@ trun cares to run python with LD_PRELOAD set appropriately to /path/to/libtsan.s from __future__ import print_function, absolute_import -import os, sys, re, subprocess, types +import os, os.path, sys, re, subprocess, platform, types PY3 = (bytes is not str) if PY3: from importlib import machinery as imp_machinery @@ -87,6 +88,7 @@ def main(): # determine if _golang.so is linked to a sanitizer, and if yes, to which # particular sanitizer DSO. Set LD_PRELOAD appropriately. 
+ libxsan = None ld_preload = None if 'linux' in sys.platform: p = subprocess.Popen(["ldd", _golang_so.path], stdout=subprocess.PIPE) @@ -127,7 +129,8 @@ def main(): _ = grep1("DYLD_INSERT_LIBRARIES=(.*)$", err) if _ is not None: - ld_preload = ("DYLD_INSERT_LIBRARIES", _.group(1)) + libxsan = _.group(1) + ld_preload = ("DYLD_INSERT_LIBRARIES", libxsan) else: print("trun %r: `import golang` failed with unexpected error:" % sys.argv[1:], file=sys.stderr) print(err, file=sys.stderr) @@ -144,7 +147,7 @@ def main(): env_prepend("TSAN_OPTIONS", "halt_on_error=1") env_prepend("ASAN_OPTIONS", "halt_on_error=1") - # tweak TSAN/ASAN defaults: + # tweak TSAN/ASAN/LSAN defaults: # enable TSAN deadlock detector # (unfortunately it caughts only few _potential_ deadlocks and actually @@ -152,15 +155,49 @@ def main(): env_prepend("TSAN_OPTIONS", "detect_deadlocks=1") env_prepend("TSAN_OPTIONS", "second_deadlock_stack=1") - # many python allocations, whose lifetime coincides with python interpreter - # lifetime and which are not explicitly freed on python shutdown, are - # reported as leaks. Disable leak reporting to avoid huge non-pygolang - # related printouts. - env_prepend("ASAN_OPTIONS", "detect_leaks=0") - # tune ASAN to check more aggressively by default env_prepend("ASAN_OPTIONS", "detect_stack_use_after_return=1") + # enable ASAN/LSAN leak detector. + # + # Do it only on CPython ≥ 3.11 because on py2 and on earlier py3 versions + # there are many many python allocations, whose lifetime coincide with + # python interpreter lifetime, and which are not explicitly freed on python + # shutdown. For py3 they significantly improved this step by step and + # starting from 3.11 it becomes practical to silence some still-leaks with + # suppressions, while for earlier py3 versions and especially for py2 it + # is, unfortunately, not manageable. 
Do not spend engineering time with + # activating LSAN on PyPy as that is tier 2 platform and bug tail history + # of memory leaks is very long even only on cpython. + if sys.version_info < (3,11): + env_prepend("ASAN_OPTIONS", "detect_leaks=0") + if libxsan is not None: + if 'asan' in libxsan.lower(): + print("W: trun %r: asan: leak detection deactivated on %s %s" % ( + sys.argv[1:], platform.python_implementation(), platform.python_version()), + file=sys.stderr) + else: + env_prepend("ASAN_OPTIONS", "detect_leaks=1") + env_prepend("LSAN_OPTIONS", "suppressions=%s" % os.path.abspath(os.path.join( + os.path.dirname(__file__), ".lsan-ignore.txt"))) + # do not print statistics for suppressed leaks - else it breaks tests that verify program output + env_prepend("LSAN_OPTIONS", "print_suppressions=0") + + # enable DWARF-based unwinding. + # else, if python is not compiled with -fno-omit-frame-pointer, it can show + # the whole traceback as e.g. just + # Direct leak of 32 byte(s) in 1 object(s) allocated from: + # #0 0x7f88522f3bd7 in malloc ../../../../src/libsanitizer/asan/asan_malloc_linux.cpp:69 + # #1 0x55f910a3d9a4 in PyThread_allocate_lock Python/thread_pthread.h:385 + # and our leak suppressions won't work. + # this is slower compared to default frame-pointer based unwinding, but + # still works reasonably timely when run with just tests. + env_prepend("ASAN_OPTIONS", "fast_unwind_on_malloc=0") + # leak suppression also needs full tracebacks to work correctly, since with + # python there are many levels of call nesting at C level, and to filter-out e.g. + # top-level PyImport_Import we need to go really deep. 
+ env_prepend("ASAN_OPTIONS", "malloc_context_size=255") + # exec `...` os.execvp(sys.argv[1], sys.argv[1:]) From 2bb971ba618fc3fbfdbbbd6f855a606c961bf6d9 Mon Sep 17 00:00:00 2001 From: Kirill Smelkov Date: Tue, 7 May 2024 14:34:34 +0300 Subject: [PATCH 23/29] X golang_str: Adjust bstr/ustr .encode() and .__bytes__ to leave string domain into bytes Initially I implemented things in such a way that (b|u)str.__bytes__ were giving bstr and ustr.encode() was giving bstr as well. My logic here was that bstr is based on bytes and it is ok to give that. However this logic did not pass backward compatibility test: for example when LXML is imported it does cdef bytes _FILENAME_ENCODING = (sys.getfilesystemencoding() or sys.getdefaultencoding() or 'ascii').encode("UTF-8") and under gpython it breaks with File "/srv/slapgrid/slappart47/srv/runner/software/7f1663e8148f227ce3c6a38fc52796e2/bin/runwsgi", line 4, in from Products.ERP5.bin.zopewsgi import runwsgi; sys.exit(runwsgi()) File "/srv/slapgrid/slappart47/srv/runner/software/7f1663e8148f227ce3c6a38fc52796e2/parts/erp5/product/ERP5/__init__.py", line 36, in from Products.ERP5Type.Utils import initializeProduct, updateGlobals File "/srv/slapgrid/slappart47/srv/runner/software/7f1663e8148f227ce3c6a38fc52796e2/parts/erp5/product/ERP5Type/__init__.py", line 42, in from .patches import pylint File "/srv/slapgrid/slappart47/srv/runner/software/7f1663e8148f227ce3c6a38fc52796e2/parts/erp5/product/ERP5Type/patches/pylint.py", line 524, in __import__(module_name, fromlist=[module_name], level=0)) File "src/lxml/sax.py", line 18, in init lxml.sax File "src/lxml/etree.pyx", line 154, in init lxml.etree TypeError: Expected bytes, got golang.bstr The breakage highlights a thinko in my previous reasoning: yes bstr is based on bytes, but bstr has different semantics compared to bytes: even though e.g. __getitem__ works the same way for bytes on py2, it works differently compared to py3. 
This way if on py3 a program is doing bytes(x) or x.encode() it then expects the result to have bytes semantics of current python which is not the case if the result is bstr. -> Fix that by adjusting .encode() and .__bytes__() to produce bytes type of current python and leave string domain. I initially was contemplating for some time to introduce a third type, e.g. bvec also based on bytes, but having bytes semantic and that bvec.decode would return back to pygolang strings domain. But due to the fact that bytes semantic is different in between py2 and py3, it would mean that bvec provided by pygolang would need to have different behaviours dependent on current python version which is undesirable. In the end with leaving into native bytes the "bytes inconsistency" problem is left to remain under std python with pygolang targeting only to fix strings inconsistency in between py2 and py3 and providing the same semantic for bstr and ustr on all python versions. It also does not harm that bytes.decode() returns std unicode instead of str: for programs that run under unpatched python we have u() to convert the result to ustr, while under gpython std unicode is actually ustr which makes bytes.decode() behaviour still quite ok. P.S. 
we enable bstr.encode for consistency and because under py2, if not enabled, it will break when running pytest under gpython in File ".../_pytest/assertion/rewrite.py", line 352, in RN = "\r\n".encode("utf-8") AttributeError: unreadable attribute --- golang/_golang_str.pyx | 78 ++++++++++++++++++++++++++------------- golang/golang_str_test.py | 54 +++++++++++++++++---------- 2 files changed, 86 insertions(+), 46 deletions(-) diff --git a/golang/_golang_str.pyx b/golang/_golang_str.pyx index 6172711..46b0a45 100644 --- a/golang/_golang_str.pyx +++ b/golang/_golang_str.pyx @@ -106,6 +106,7 @@ from cython cimport no_gc from libc.stdio cimport FILE from golang cimport strconv +import codecs as pycodecs import string as pystring import types as pytypes import functools as pyfunctools @@ -343,9 +344,12 @@ cdef class _pybstr(bytes): # https://github.com/cython/cython/issues/711 # _pybstr.__new__ is hand-made in _pybstr_tp_new which invokes ↑ _pybstr__new__() . - def __bytes__(self): return pyb(self) # see __str__ - def __unicode__(self): return pyu(self) + # __bytes__ converts string to bytes leaving string domain. + # NOTE __bytes__ and encode are the only operations that leave string domain. + # NOTE __bytes__ is used only by py3 and only for `bytes(obj)` and `b'%s/%b' % obj`. + def __bytes__(self): return _bdata(self) # -> bytes + def __unicode__(self): return pyu(self) def __str__(self): if PY_MAJOR_VERSION >= 3: return pyu(self) @@ -482,13 +486,32 @@ cdef class _pybstr(bytes): # https://github.com/cython/cython/issues/711 # encode/decode - def decode(self, encoding=None, errors=None): - if encoding is None and errors is None: - encoding = 'utf-8' # NOTE always UTF-8, not sys.getdefaultencoding - errors = 'surrogateescape' - else: - if encoding is None: encoding = 'utf-8' - if errors is None: errors = 'strict' + # + # Encoding strings - both bstr and ustr - convert type to bytes leaving string domain. 
+ # + # Encode treats bstr and ustr as string, encoding unicode representation of + # the string to bytes. For bstr it means that the string representation is + # first converted to unicode and encoded to bytes from there. For ustr + # unicode representation of the string is directly encoded. + # + # Decoding strings is not provided. However for bstr the decode is provided + # treating input data as raw bytes and producing ustr as the result. + # + # NOTE __bytes__ and encode are the only operations that leave string domain. + def encode(self, encoding=None, errors=None): # -> bytes + encoding, errors = _encoding_with_defaults(encoding, errors) + + # on py2 e.g. bytes.encode('string-escape') works on bytes directly + if PY_MAJOR_VERSION < 3: + codec = pycodecs.lookup(encoding) + if not codec._is_text_encoding or \ + encoding in ('string-escape',): # string-escape also works on bytes + return codec.encode(self, errors)[0] + + return pyu(self).encode(encoding, errors) + + def decode(self, encoding=None, errors=None): # -> ustr | bstr on py2 for encodings like string-escape + encoding, errors = _encoding_with_defaults(encoding, errors) if encoding == 'utf-8' and errors == 'surrogateescape': x = _utf8_decode_surrogateescape(self) @@ -499,11 +522,6 @@ cdef class _pybstr(bytes): # https://github.com/cython/cython/issues/711 return pyb(x) return pyu(x) - if PY_MAJOR_VERSION < 3: - # whiteout encode inherited from bytes - # TODO ideally whiteout it in such a way that bstr.encode also raises AttributeError - encode = property(doc='bstr has no encode') - # all other string methods @@ -667,9 +685,11 @@ cdef class _pyustr(unicode): # _pyustr.__new__ is hand-made in _pyustr_tp_new which invokes ↑ _pyustr__new__() . - def __bytes__(self): return pyb(self) - def __unicode__(self): return pyu(self) # see __str__ + # __bytes__ converts string to bytes leaving string domain. + # see bstr.__bytes__ for more details. 
+ def __bytes__(self): return _bdata(pyb(self)) # -> bytes + def __unicode__(self): return pyu(self) # see __str__ def __str__(self): if PY_MAJOR_VERSION >= 3: return pyu(self) # = self or pyustr if it was subclass @@ -793,20 +813,15 @@ cdef class _pyustr(unicode): return pyu(zunicode.__format__(self, format_spec)) - # encode/decode - def encode(self, encoding=None, errors=None): - if encoding is None and errors is None: - encoding = 'utf-8' # NOTE always UTF-8, not sys.getdefaultencoding - errors = 'surrogateescape' - else: - if encoding is None: encoding = 'utf-8' - if errors is None: errors = 'strict' + # encode/decode (see bstr for details) + def encode(self, encoding=None, errors=None): # -> bytes + encoding, errors = _encoding_with_defaults(encoding, errors) if encoding == 'utf-8' and errors == 'surrogateescape': x = _utf8_encode_surrogateescape(self) else: x = zunicode.encode(self, encoding, errors) - return pyb(x) + return x if PY_MAJOR_VERSION < 3: # whiteout decode inherited from unicode @@ -1987,6 +2002,18 @@ cdef extern from "Python.h": # ---- UTF-8 encode/decode ---- +# _encoding_with_defaults returns encoding and errors substituted with defaults +# as needed for functions like ustr.encode and bstr.decode . 
+cdef _encoding_with_defaults(encoding, errors): # -> (encoding, errors) + if encoding is None and errors is None: + encoding = 'utf-8' # NOTE always UTF-8, not sys.getdefaultencoding + errors = 'surrogateescape' + else: + if encoding is None: encoding = 'utf-8' + if errors is None: errors = 'strict' + return (encoding, errors) + + # TODO(kirr) adjust UTF-8 encode/decode surrogateescape(*) a bit so that not # only bytes -> unicode -> bytes is always identity for any bytes (this is # already true), but also that unicode -> bytes -> unicode is also always true @@ -2238,7 +2265,6 @@ cdef _patch_str(): # XXX explain bpreserve_slots = upreserve_slots = ("maketrans",) if PY_MAJOR_VERSION < 3: - bpreserve_slots += ("encode",) # @property'ies upreserve_slots += ("decode",) # patch unicode to be pyustr. This patches diff --git a/golang/golang_str_test.py b/golang/golang_str_test.py index 6f88ad5..ea7456b 100644 --- a/golang/golang_str_test.py +++ b/golang/golang_str_test.py @@ -231,13 +231,15 @@ def test_strings_basic(): assert b(bs) is bs; assert bstr(bs) is bs assert u(us) is us; assert ustr(us) is us - # bytes(b(·)) = identity, unicode(u(·)) = identity - assert bytes (bs) is bs + # unicode(u(·)) = identity assert unicode(us) is us - # unicode(b) -> u, bytes(u) -> b + # unicode(b) -> u _ = unicode(bs); assert type(_) is ustr; assert _ == "мир" - _ = bytes (us); assert type(_) is bstr; assert _ == "мир" + + # bytes(b|u) -> bytes + _ = bytes(bs); assert type(_) is x32(bytes, bstr); assert _ == b'\xd0\xbc\xd0\xb8\xd1\x80' + _ = bytes(us); assert type(_) is x32(bytes, bstr); assert _ == b'\xd0\xbc\xd0\xb8\xd1\x80' # bytearray(b|u) -> bytearray _ = bytearray(bs); assert type(_) is bytearray; assert _ == b'\xd0\xbc\xd0\xb8\xd1\x80' @@ -651,14 +653,13 @@ def test_strings_encodedecode(): us = u('мир') bs = b('май') + _ = us.encode(); assert type(_) is bytes; assert _ == xbytes('мир') + _ = us.encode('utf-8'); assert type(_) is bytes; assert _ == xbytes('мир') + _ = bs.encode(); 
assert type(_) is bytes; assert _ == xbytes('май') + _ = bs.encode('utf-8'); assert type(_) is bytes; assert _ == xbytes('май') + # TODO also raise AttributeError on .encode/.decode lookup on classes - assert hasattr(us, 'encode') ; assert hasattr(ustr, 'encode') - assert not hasattr(bs, 'encode') #; assert not hasattr(bstr, 'encode') assert not hasattr(us, 'decode') #; assert not hasattr(ustr, 'decode') - assert hasattr(bs, 'decode') ; assert hasattr(bstr, 'decode') - - _ = us.encode(); assert type(_) is bstr; assert _bdata(_) == xbytes('мир') - _ = us.encode('utf-8'); assert type(_) is bstr; assert _bdata(_) == xbytes('мир') _ = bs.decode(); assert type(_) is ustr; assert _udata(_) == u'май' _ = bs.decode('utf-8'); assert type(_) is ustr; assert _udata(_) == u'май' @@ -673,10 +674,10 @@ def test_strings_encodedecode(): assert type(_) is ustr assert _udata(_) == u'мир' - b_cpmir = us.encode('cp1251') - assert type(b_cpmir) is bstr - assert _bdata(b_cpmir) == u'мир'.encode('cp1251') - assert _bdata(b_cpmir) == b'\xec\xe8\xf0' + cpmir = us.encode('cp1251') + assert type(cpmir) is bytes + assert cpmir == u'мир'.encode('cp1251') + assert cpmir == b'\xec\xe8\xf0' # decode/encode errors u_k8mir = b_k8mir.decode() # no decode error with @@ -697,11 +698,14 @@ def test_strings_encodedecode(): us.encode('ascii') _ = u_k8mir.encode() # no encode error with - assert type(_) is bstr # default parameters - assert _bdata(_) == k8mir + assert type(_) is bytes # default parameters + assert _ == k8mir _ = u_k8mir.encode('utf-8', 'surrogateescape') # no encode error with - assert type(_) is bstr # explicit utf-8/surrogateescape - assert _bdata(_) == k8mir + assert type(_) is bytes # explicit utf-8/surrogateescape + assert _ == k8mir + _ = b_k8mir.encode() # bstr.encode = bstr -> ustr -> encode + assert type(_) is bytes + assert _ == k8mir # on py2 unicode.encode accepts surrogate pairs and does not complain # TODO(?) 
manually implement encode/py2 and reject surrogate pairs by default @@ -724,6 +728,14 @@ def test_strings_encodedecode(): _ = b(r'x\'y').decode('string-escape'); assert type(_) is bstr; assert _bdata(_) == b"x'y" _ = b('616263').decode('hex'); assert type(_) is bstr; assert _bdata(_) == b"abc" + # similarly for bytes.encode + if six.PY3: + with raises(LookupError): bs.encode('hex') + with raises(LookupError): bs.encode('string-escape') + else: + _ = bs.encode('hex'); assert type(_) is bytes; assert _ == b'd0bcd0b0d0b9' + _ = bs.encode('string-escape'); assert type(_) is bytes; assert _ == br'\xd0\xbc\xd0\xb0\xd0\xb9' + # verify string operations like `x * 3` for all cases from bytes, bytearray, unicode, bstr and ustr. @mark.parametrize('tx', (bytes, unicode, bytearray, bstr, ustr)) @@ -1418,6 +1430,8 @@ def M(fmt, args, ok): M("α %s π", BB(xbytes('мир2')) , "α байты π") # not мир2 # vvv does not work on py3 as b'' % b'' does not consult __str__ nor __bytes__ of the argument # even though it is not 100% we are ok here, because customizing bytes or unicode is very exotic + # + # XXX the code in bytesobject.c::format_obj tells different -> recheck. 
if six.PY2: M("α %s π", (BB(xbytes('мир2')),) , "α байты π") # not мир2 M("α %s π", [BB(xbytes('мир2'))] , "α [BB(байты)] π") # not [мир2] @@ -1884,8 +1898,8 @@ class MyStr(tx): # for bstr/ustr __bytes__/__unicode__ return *str, never MyStr # (builtin unicode has no __bytes__/__unicode__) if tx is not unicode: - _ = xx.__bytes__(); assert type(_) is bstr; assert _ == 'мир' - _ = xx.__unicode__(); assert type(_) is ustr; assert _ == 'мир' + _ = xx.__bytes__(); assert type(_) is bytes; assert _ == xbytes('мир') + _ = xx.__unicode__(); assert type(_) is ustr; assert _ == 'мир' # subclass with __str__ From cb0e6055712b967ef45ef7f607f31f2e8c083420 Mon Sep 17 00:00:00 2001 From: Kirill Smelkov Date: Tue, 7 May 2024 14:56:14 +0300 Subject: [PATCH 24/29] X golang_str: Fix iter(bstr) to yield byte instead of unicode character Things were initially implemented to follow Go semantic exactly with bytestring iteration yielding unicode characters as explained in https://blog.golang.org/strings. However this makes bstr not a 100% drop-in compatible replacement for std str under py2, and even though my initial testing was saying this change does not affect programs in practice it turned out to be not the case. For example with bstr.__iter__ yielding unicode characters running gpython on py2 will break sometimes when importing uuid: There uuid reads 16 bytes from /dev/random and then wants to iterate those 16 bytes as single bytes and then expects that the length of the resulting sequence is exactly 16: int = long(('%02x'*16) % tuple(map(ord, bytes)), 16) ( https://github.com/python/cpython/blob/2.7-0-g8d21aa21f2c/Lib/uuid.py#L147 ) which breaks if some of the read bytes are higher than 0x7f. Even though this particular problem could be worked-around with patching uuid, there is no evidence that there will be no similar problems later, which could be many. 
-> So adjust bstr semantic instead to follow semantic of str under py2 and introduce uiter() primitive to still be able to iterate bytestrings as unicode characters. This makes bstr, hopefully, to be fully compatible with str on py2 while still providing reasonably good approach for strings processing the Go-way when needed. Add biter as well for symmetry. --- README.rst | 16 +++++---- golang/__init__.py | 11 ++++--- golang/_golang_str.pyx | 69 +++++++++++++++++++++++++++------------ golang/golang_str_test.py | 49 ++++++++++++++------------- gpython/gpython_test.py | 2 ++ 5 files changed, 93 insertions(+), 54 deletions(-) diff --git a/README.rst b/README.rst index 80846e7..03d19c0 100644 --- a/README.rst +++ b/README.rst @@ -241,12 +241,16 @@ The conversion, in both encoding and decoding, never fails and never looses information: `bstr→ustr→bstr` and `ustr→bstr→ustr` are always identity even if bytes data is not valid UTF-8. +Both `bstr` and `ustr` represent stings. They are two different *representations* of the same entity. + Semantically `bstr` is array of bytes, while `ustr` is array of -unicode-characters. Accessing their elements by `[index]` yields byte and -unicode character correspondingly [*]_. Iterating them, however, yields unicode -characters for both `bstr` and `ustr`. In practice `bstr` is enough 99% of the -time, and `ustr` only needs to be used for random access to string characters. -See `Strings, bytes, runes and characters in Go`__ for overview of this approach. +unicode-characters. Accessing their elements by `[index]` and iterating them yield byte and +unicode character correspondingly [*]_. However it is possible to yield unicode +character when iterating `bstr` via `uiter`, and to yield byte character when +iterating `ustr` via `biter`. In practice `bstr` + `uiter` is enough 99% of +the time, and `ustr` only needs to be used for random access to string +characters. 
See `Strings, bytes, runes and characters in Go`__ for overview of +this approach. __ https://blog.golang.org/strings @@ -267,7 +271,7 @@ Usage example:: s = b('привет') # s is bstr corresponding to UTF-8 encoding of 'привет'. s += ' мир' # s is b('привет мир') - for c in s: # c will iterate through + for c in uiter(s): # c will iterate through ... # [u(_) for _ in ('п','р','и','в','е','т',' ','м','и','р')] # the following gives b('привет мир труд май') diff --git a/golang/__init__.py b/golang/__init__.py index 00babf6..9b90797 100644 --- a/golang/__init__.py +++ b/golang/__init__.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright (C) 2018-2023 Nexedi SA and Contributors. +# Copyright (C) 2018-2024 Nexedi SA and Contributors. # Kirill Smelkov # # This program is free software: you can Use, Study, Modify and Redistribute @@ -24,7 +24,7 @@ - `func` allows to define methods separate from class. - `defer` allows to schedule a cleanup from the main control flow. - `error` and package `errors` provide error chaining. -- `b`, `u` and `bstr`/`ustr` provide uniform UTF8-based approach to strings. +- `b`, `u`, `bstr`/`ustr` and `biter`/`uiter` provide uniform UTF8-based approach to strings. - `gimport` allows to import python modules by full path in a Go workspace. See README for thorough overview. 
@@ -36,7 +36,8 @@ __version__ = "0.1" __all__ = ['go', 'chan', 'select', 'default', 'nilchan', 'defer', 'panic', - 'recover', 'func', 'error', 'b', 'u', 'bstr', 'ustr', 'bbyte', 'uchr', 'gimport'] + 'recover', 'func', 'error', 'b', 'u', 'bstr', 'ustr', 'biter', 'uiter', 'bbyte', 'uchr', + 'gimport'] import setuptools_dso setuptools_dso.dylink_prepare_dso('golang.runtime.libgolang') @@ -323,4 +324,6 @@ def _emit_exc_context(exc, emitf, recursef): pybbyte as bbyte, \ pyu as u, \ pyustr as ustr, \ - pyuchr as uchr + pyuchr as uchr, \ + pybiter as biter, \ + pyuiter as uiter diff --git a/golang/_golang_str.pyx b/golang/_golang_str.pyx index 46b0a45..137e302 100644 --- a/golang/_golang_str.pyx +++ b/golang/_golang_str.pyx @@ -141,7 +141,7 @@ cpdef pyb(s): # -> bstr b(u(bytes_input)) is bstr with the same data as bytes_input. - See also: u, bstr/ustr. + See also: u, bstr/ustr, biter/uiter. """ bs = _pyb(pybstr, s) if bs is None: @@ -164,7 +164,7 @@ cpdef pyu(s): # -> ustr u(b(unicode_input)) is ustr with the same data as unicode_input. - See also: b, bstr/ustr. + See also: b, bstr/ustr, biter/uiter. """ us = _pyu(pyustr, s) if us is None: @@ -280,8 +280,6 @@ cdef __pystr(object obj): # -> ~str return pyb(obj) -# XXX -> bchr ? (not good as "character" means "unicode character") -# -> bstr.chr ? def pybbyte(int i): # -> 1-byte bstr """bbyte(i) returns 1-byte bstr with ordinal i.""" return pyb(bytearray([i])) @@ -318,11 +316,11 @@ cdef class _pybstr(bytes): # https://github.com/cython/cython/issues/711 is always identity even if bytes data is not valid UTF-8. - Semantically bstr is array of bytes. Accessing its elements by [index] - yields byte character. Iterating through bstr, however, yields unicode - characters. In practice bstr is enough 99% of the time, and ustr only - needs to be used for random access to string characters. See - https://blog.golang.org/strings for overview of this approach. + Semantically bstr is array of bytes. 
Accessing its elements by [index] and + iterating it yield byte character. However it is possible to yield unicode + character when iterating bstr via uiter. In practice bstr + uiter is enough + 99% of the time, and ustr only needs to be used for random access to string + characters. See https://blog.golang.org/strings for overview of this approach. Operations in between bstr and ustr/unicode / bytes/bytearray coerce to bstr. When the coercion happens, bytes and bytearray, similarly to bstr, are also @@ -337,7 +335,7 @@ cdef class _pybstr(bytes): # https://github.com/cython/cython/issues/711 to bstr. See b for details. - otherwise bstr will have string representation of the object. - See also: b, ustr/u. + See also: b, ustr/u, biter/uiter. """ # XXX due to "cannot `cdef class` with __new__" (https://github.com/cython/cython/issues/799) @@ -414,10 +412,13 @@ cdef class _pybstr(bytes): # https://github.com/cython/cython/issues/711 else: return pyb(x) - # __iter__ - yields unicode characters + # __iter__ def __iter__(self): - # TODO iterate without converting self to u - return pyu(self).__iter__() + if PY_MAJOR_VERSION >= 3: + return _pybstrIter(zbytes.__iter__(self)) + else: + # on python 2 str does not have .__iter__ + return PySeqIter_New(self) # __contains__ @@ -668,8 +669,8 @@ cdef class _pyustr(unicode): elements by [index] yields unicode characters. ustr complements bstr and is meant to be used only in situations when - random access to string characters is needed. Otherwise bstr is more - preferable and should be enough 99% of the time. + random access to string characters is needed. Otherwise bstr + uiter is + more preferable and should be enough 99% of the time. Operations in between ustr and bstr/bytes/bytearray / unicode coerce to ustr. When the coercion happens, bytes and bytearray, similarly to bstr, are also @@ -678,7 +679,7 @@ cdef class _pyustr(unicode): ustr constructor, similarly to the one in bstr, accepts arbitrary objects and stringify them. 
Please refer to bstr and u documentation for details. - See also: u, bstr/b. + See also: u, bstr/b, biter/uiter. """ # XXX due to "cannot `cdef class` with __new__" (https://github.com/cython/cython/issues/799) @@ -983,17 +984,43 @@ cdef PyObject* _pyustr_tp_new(PyTypeObject* _cls, PyObject* _argv, PyObject* _kw assert sizeof(_pyustr) == sizeof(PyUnicodeObject) -# _pyustrIter wraps unicode iterator to return pyustr for each yielded character. +# _pybstrIter wraps bytes iterator to return pybstr for each yielded byte. +cdef class _pybstrIter: + cdef object zbiter + def __init__(self, zbiter): + self.zbiter = zbiter + def __iter__(self): + return self + def __next__(self): + x = next(self.zbiter) + if PY_MAJOR_VERSION >= 3: + return pybbyte(x) + else: + return pyb(x) + +# _pyustrIter wraps zunicode iterator to return pyustr for each yielded character. cdef class _pyustrIter: - cdef object uiter - def __init__(self, uiter): - self.uiter = uiter + cdef object zuiter + def __init__(self, zuiter): + self.zuiter = zuiter def __iter__(self): return self def __next__(self): - x = next(self.uiter) + x = next(self.zuiter) return pyu(x) + +def pybiter(obj): + """biter(obj) is like iter(b(obj)) but TODO: iterates object incrementally + without doing full convertion to bstr.""" + return iter(pyb(obj)) # TODO iterate obj directly + +def pyuiter(obj): + """uiter(obj) is like iter(u(obj)) but TODO: iterates object incrementally + without doing full convertion to ustr.""" + return iter(pyu(obj)) # TODO iterate obj directly + + # _pyustrTranslateTab wraps table for .translate to return bstr as unicode # because unicode.translate does not accept bstr values. 
cdef class _pyustrTranslateTab: diff --git a/golang/golang_str_test.py b/golang/golang_str_test.py index ea7456b..0c5df23 100644 --- a/golang/golang_str_test.py +++ b/golang/golang_str_test.py @@ -21,7 +21,7 @@ from __future__ import print_function, absolute_import import golang -from golang import b, u, bstr, ustr, bbyte, uchr, func, defer, panic +from golang import b, u, bstr, ustr, biter, uiter, bbyte, uchr, func, defer, panic from golang._golang import _udata, _bdata from golang.gcompat import qq from golang.strconv_test import byterange @@ -617,35 +617,38 @@ def test_strings_index2(): # verify strings iteration. def test_strings_iter(): + # iter(u/unicode) + uiter(*) -> iterate unicode characters + # iter(b/bytes) + biter(*) -> iterate byte characters us = u("миру мир"); u_ = u"миру мир" - bs = b("миру мир") - - # iter( b/u/unicode ) -> iterate unicode characters - # NOTE that iter(b) too yields unicode characters - not integers or bytes - #bi = iter(bs) # XXX temp disabled - bi = iter(us) - ui = iter(us) - ui_ = iter(u_) + bs = b("миру мир"); b_ = xbytes("миру мир"); a_ = xbytearray(b_) + + # XIter verifies that going through all given iterators produces the same type and results. 
+ missing=object() class XIter: + def __init__(self, typok, *viter): + self.typok = typok + self.viter = viter def __iter__(self): return self - def __next__(self, missing=object): - x = next(bi, missing) - y = next(ui, missing) - z = next(ui_, missing) - assert type(x) is type(y) - if x is not missing: - assert type(x) is ustr - if z is not missing: - assert type(z) is unicode - assert x == y - assert y == z - if x is missing: + def __next__(self): + vnext = [] + for it in self.viter: + obj = next(it, missing) + vnext.append(obj) + if missing in vnext: + assert vnext == [missing]*len(self.viter) raise StopIteration - return x + for obj in vnext: + assert type(obj) is self.typok + assert obj == vnext[0] + return vnext[0] next = __next__ # py2 - assert list(XIter()) == ['м','и','р','у',' ','м','и','р'] + assert list(XIter(ustr, iter(us), uiter(us), uiter(u_), uiter(bs), uiter(b_), uiter(a_))) == \ + ['м','и','р','у',' ','м','и','р'] + assert list(XIter(bstr, iter(bs), biter(us), biter(u_), biter(bs), biter(b_), biter(a_))) == \ + [b'\xd0',b'\xbc',b'\xd0',b'\xb8',b'\xd1',b'\x80',b'\xd1',b'\x83',b' ', + b'\xd0',b'\xbc',b'\xd0',b'\xb8',b'\xd1',b'\x80'] # verify .encode/.decode . 
diff --git a/gpython/gpython_test.py b/gpython/gpython_test.py index 85b97fb..a4775b1 100644 --- a/gpython/gpython_test.py +++ b/gpython/gpython_test.py @@ -87,6 +87,8 @@ def test_golang_builtins(): assert u is golang.u assert bstr is golang.bstr assert ustr is golang.ustr + assert biter is golang.biter + assert uiter is golang.uiter assert bbyte is golang.bbyte assert uchr is golang.uchr From a341f7612402510ece95e02d308b7a88ef66cde0 Mon Sep 17 00:00:00 2001 From: Kirill Smelkov Date: Wed, 8 May 2024 13:21:55 +0300 Subject: [PATCH 25/29] X golang_str: Fix bstr/ustr __eq__ and friends to return NotImplemented wrt non-string types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In 54c2a3cf (golang_str: Teach bstr/ustr to compare wrt any string with automatic coercion) I've added __eq__, __ne__, __lt__ etc methods to our strings, but __lt__ and other comparison to raise TypeError against any non-string type. My idea was to mimic user-visible py3 behaviour such as >>> "abc" > 1 Traceback (most recent call last): File "", line 1, in TypeError: '>' not supported between instances of 'str' and 'int' However it turned out that the implementation was not exactly matching what Python is doing internally which lead to incorrect behaviour when bstr or ustr is compared wrt another type with its own __cmp__. In the general case for `a op b` Python first queries a.__op__(b) and b.__op'__(a) and sometimes other methods before going to .__cmp__. This relies on the methods to return NotImplemented instead of raising an exception and if a trial raises TypeError everything is stopped and that TypeError is returned to the caller. Jérome reports a real breakage due to this when bstr is compared wrt distutils.version.LooseVersion . 
LooseVersion is basically class LooseVersion(Version): def __cmp__ (self, other): if isinstance(other, StringType): other = LooseVersion(other) return cmp(self.version, other.version) but due to my thinko on `LooseVersion < bstr` the control flow was not getting into that LooseVersion.__cmp__ because bstr.__gt__ was tried first and raised TypeError. -> Fix all comparison operations to return NotImplemented instead of raising TypeError and make sure in the tests that this behaviour exactly matches what native str type does. The fix is needed not only for py2 because added test_strings_cmp_wrt_distutils_LooseVersion was failing on py3 as well without the fix. /reported-by @jerome /reported-on https://lab.nexedi.com/nexedi/slapos/-/merge_requests/1575#note_206080 --- golang/_golang_str.pyx | 86 ++++++++++++++++++++++++++------ golang/golang_str_test.py | 102 +++++++++++++++++++++++++++++++++----- 2 files changed, 160 insertions(+), 28 deletions(-) diff --git a/golang/_golang_str.pyx b/golang/_golang_str.pyx index 137e302..ac6233d 100644 --- a/golang/_golang_str.pyx +++ b/golang/_golang_str.pyx @@ -383,19 +383,48 @@ cdef class _pybstr(bytes): # https://github.com/cython/cython/issues/711 return zbytes.__hash__(self) # == != < > <= >= - # NOTE == and != are special: they must succeed against any type so that - # bstr could be used as dict key. + # NOTE all operations must succeed against any type so that bstr could be + # used as dict key and arbitrary three-way comparisons, done by python, + # work correctly. This means that on py2 e.g. `bstr > int` will behave + # exactly as builtin str and won't raise TypeError. On py3 TypeError is + # raised for such operations by python itself when it receives + # NotImplemented from all tried methods. 
def __eq__(a, b): try: b = _pyb_coerce(b) except TypeError: - return False + return NotImplemented return zbytes.__eq__(a, b) - def __ne__(a, b): return not a.__eq__(b) - def __lt__(a, b): return zbytes.__lt__(a, _pyb_coerce(b)) - def __gt__(a, b): return zbytes.__gt__(a, _pyb_coerce(b)) - def __le__(a, b): return zbytes.__le__(a, _pyb_coerce(b)) - def __ge__(a, b): return zbytes.__ge__(a, _pyb_coerce(b)) + def __ne__(a, b): + try: + b = _pyb_coerce(b) + except TypeError: + return NotImplemented + return zbytes.__ne__(a, b) + def __lt__(a, b): + try: + b = _pyb_coerce(b) + except TypeError: + return NotImplemented + return zbytes.__lt__(a, _pyb_coerce(b)) + def __gt__(a, b): + try: + b = _pyb_coerce(b) + except TypeError: + return NotImplemented + return zbytes.__gt__(a, _pyb_coerce(b)) + def __le__(a, b): + try: + b = _pyb_coerce(b) + except TypeError: + return NotImplemented + return zbytes.__le__(a, _pyb_coerce(b)) + def __ge__(a, b): + try: + b = _pyb_coerce(b) + except TypeError: + return NotImplemented + return zbytes.__ge__(a, _pyb_coerce(b)) # len - no need to override @@ -724,19 +753,44 @@ cdef class _pyustr(unicode): return hash(pyb(self)) # == != < > <= >= - # NOTE == and != are special: they must succeed against any type so that - # ustr could be used as dict key. + # NOTE all operations must succeed against any type. + # See bstr for details. 
def __eq__(a, b): try: b = _pyu_coerce(b) except TypeError: - return False + return NotImplemented return zunicode.__eq__(a, b) - def __ne__(a, b): return not a.__eq__(b) - def __lt__(a, b): return zunicode.__lt__(a, _pyu_coerce(b)) - def __gt__(a, b): return zunicode.__gt__(a, _pyu_coerce(b)) - def __le__(a, b): return zunicode.__le__(a, _pyu_coerce(b)) - def __ge__(a, b): return zunicode.__ge__(a, _pyu_coerce(b)) + def __ne__(a, b): + try: + b = _pyu_coerce(b) + except TypeError: + return NotImplemented + return zunicode.__ne__(a, b) + def __lt__(a, b): + try: + b = _pyu_coerce(b) + except TypeError: + return NotImplemented + return zunicode.__lt__(a, _pyu_coerce(b)) + def __gt__(a, b): + try: + b = _pyu_coerce(b) + except TypeError: + return NotImplemented + return zunicode.__gt__(a, _pyu_coerce(b)) + def __le__(a, b): + try: + b = _pyu_coerce(b) + except TypeError: + return NotImplemented + return zunicode.__le__(a, _pyu_coerce(b)) + def __ge__(a, b): + try: + b = _pyu_coerce(b) + except TypeError: + return NotImplemented + return zunicode.__ge__(a, _pyu_coerce(b)) # len - no need to override diff --git a/golang/golang_str_test.py b/golang/golang_str_test.py index 0c5df23..dfc2c92 100644 --- a/golang/golang_str_test.py +++ b/golang/golang_str_test.py @@ -942,10 +942,18 @@ def test_strings_ops2_bufreject(tx, ty): assert (x == y) is False # see test_strings_ops2_eq_any assert (x != y) is True - with raises(TypeError): x >= y - with raises(TypeError): x <= y - with raises(TypeError): x > y - with raises(TypeError): x < y + if six.PY3: + with raises(TypeError): "abc" >= y # x.__op__(y) and y.__op'__(x) both return + with raises(TypeError): x >= y # NotImplemented which leads py3 to raise TypeError + with raises(TypeError): x <= y + with raises(TypeError): x > y + with raises(TypeError): x < y + else: + "abc" >= y # does not raise but undefined + x >= y # ----//---- + x <= y + x > y + x < y # reverse operations, e.g. 
memoryview + bstr with raises(TypeError): y + x @@ -959,10 +967,18 @@ def test_strings_ops2_bufreject(tx, ty): y == x # not raises TypeError - see test_strings_ops2_eq_any y != x # if tx is not bstr: - with raises(TypeError): y >= x - with raises(TypeError): y <= x - with raises(TypeError): y > x - with raises(TypeError): y < x + if six.PY3: + with raises(TypeError): y >= "abc" # see ^^^ + with raises(TypeError): y >= x + with raises(TypeError): y <= x + with raises(TypeError): y > x + with raises(TypeError): y < x + else: + y >= "abc" + y >= x + y <= x + y > x + y < x # verify string operations like `x == *` for x being bstr/ustr. @@ -982,10 +998,19 @@ def test_strings_ops2_eq_any(tx): def assertNE(y): assert (x == y) is False assert (x != y) is True - with raises(TypeError): x >= y - with raises(TypeError): x <= y - with raises(TypeError): x > y - with raises(TypeError): x < y + if six.PY3: + with raises(TypeError): "abc" >= y # py3: NotImplemented -> raise + with raises(TypeError): x >= y + with raises(TypeError): x <= y + with raises(TypeError): x > y + with raises(TypeError): x < y + else: + "abc" >= y # py2: no raise on NotImplemented; result is undefined + x >= y + x <= y + x > y + x < y + _ = assertNE _(None) @@ -1009,6 +1034,21 @@ def assertNE(y): with raises(TypeError): hash(l) _(l) + # also verify that internally x.__op__(y of non-string-type) returns + # NotImplemented - exactly the same way as builtin str type does. Even + # though `x op y` gives proper answer internally python counts on x.__op__(y) + # to return NotImplemented so that arbitrary three-way comparison works properly. 
+ s = xstr(u'мир', str) + for op in ('eq', 'ne', 'lt', 'gt', 'le', 'ge'): + sop = getattr(s, '__%s__' % op) + xop = getattr(x, '__%s__' % op) + assert sop(None) is NotImplemented + assert xop(None) is NotImplemented + assert sop(0) is NotImplemented + assert xop(0) is NotImplemented + assert sop(hx) is NotImplemented + assert xop(hx) is NotImplemented + # verify logic in `bstr % ...` and `bstr.format(...)` . def test_strings_mod_and_format(): @@ -2624,6 +2664,44 @@ def tests_strings_early_str_subclass(): # XXX more... +# ---- issues hit by users ---- +# fixes for below issues have their corresponding tests in the main part above, but +# we also add tests with original code where problems were hit. + +# three-way comparison wrt class with __cmp__ was working incorrectly because +# bstr.__op__ were not returning NotImplemented wrt non-string types. +# https://lab.nexedi.com/nexedi/slapos/-/merge_requests/1575#note_206080 +@mark.parametrize('tx', (str, bstr if str is bytes else ustr)) # LooseVersion does not handle unicode on py2 +def test_strings_cmp_wrt_distutils_LooseVersion(tx): + from distutils.version import LooseVersion + + l = LooseVersion('1.16.2') + + x = xstr('1.12', tx) + assert not (x == l) + assert not (l == x) + assert x != l + assert l != x + assert not (x >= l) + assert l >= x + assert x <= l + assert not (l <= x) + assert x < l + assert not (l < x) + + x = xstr('1.16.2', tx) + assert x == l + assert l == x + assert not (x != l) + assert not (l != x) + assert x >= l + assert l >= x + assert x <= l + assert l <= x + assert not (x < l) + assert not (l < x) + + # ---- benchmarks ---- # utf-8 decoding From 84ed3e79b9ceb72c3c7fe4505c2ef1a4af747403 Mon Sep 17 00:00:00 2001 From: Kirill Smelkov Date: Wed, 8 May 2024 16:24:38 +0300 Subject: [PATCH 26/29] X golang_str: More fixes for bstr to be accepted as name of an attribute MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This time we hit that builtin getattr was 
rejecting it. Fix it via patching _PyObject_LookupAttr, that builtin getattr uses, and by adding tests for this functionality. Reported by Jérome at https://lab.nexedi.com/nexedi/slapos/-/merge_requests/1575#note_206080 --- golang/.gitignore | 1 + golang/_golang_str.pyx | 57 +++++++++++++++++++++++++++-------- golang/_golang_str_test.pyx | 47 +++++++++++++++++++++++++++++ golang/_golang_test.pyx | 5 ++++ golang/golang_str_test.py | 59 ++++++++++++++++++++++++++++++++++++- setup.py | 4 ++- 6 files changed, 158 insertions(+), 15 deletions(-) create mode 100644 golang/_golang_str_test.pyx diff --git a/golang/.gitignore b/golang/.gitignore index 7492664..e532690 100644 --- a/golang/.gitignore +++ b/golang/.gitignore @@ -6,6 +6,7 @@ /_fmt_test.cpp /_golang.cpp /_golang_test.cpp +/_golang_str_test.cpp /_io.cpp /_os.cpp /_os_test.cpp diff --git a/golang/_golang_str.pyx b/golang/_golang_str.pyx index ac6233d..890ab9a 100644 --- a/golang/_golang_str.pyx +++ b/golang/_golang_str.pyx @@ -36,7 +36,8 @@ from cpython cimport PyObject_CheckBuffer from cpython cimport Py_TPFLAGS_HAVE_GC, Py_TPFLAGS_HEAPTYPE, Py_TPFLAGS_READY, PyType_Ready from cpython cimport Py_TPFLAGS_VALID_VERSION_TAG from cpython cimport PyBytes_Format, PyUnicode_Format, PyObject_Str -from cpython cimport PyObject_GetAttr, PyObject_SetAttr +from cpython cimport PyObject_GetAttr, PyObject_SetAttr, PyObject_HasAttr +from cpython cimport PyBytes_Check cdef extern from "Python.h": PyTypeObject PyBytes_Type @@ -1997,34 +1998,64 @@ cdef _patch_capi_object_str(): # XXX place, comments, test # on py3 PyObject_GetAttr & co insist on name to be unicode -# XXX _PyObject_LookupAttr # XXX _PyObject_GenericGetAttrWithDict # XXX _PyObject_GenericSetAttrWithDict # XXX type_getattro IF PY3: + cdef extern from "Python.h": + int _PyObject_LookupAttr(object obj, object attr, PyObject** pres) except -1 + ctypedef object obj_getattr_func(object, object) ctypedef int obj_setattr_func(object, object, object) except -1 - - cdef 
obj_getattr_func* _pobject_GetAttr = PyObject_GetAttr - cdef obj_setattr_func* _pobject_SetAttr = PyObject_SetAttr + # delattr is implemented via setattr(v=NULL) + ctypedef bint obj_hasattr_func(object, object) # no except + ctypedef int obj_lookupattr_func(object, object, PyObject**) except -1 + + cdef obj_getattr_func* _pobject_GetAttr = PyObject_GetAttr + cdef obj_setattr_func* _pobject_SetAttr = PyObject_SetAttr + cdef obj_hasattr_func* _pobject_HasAttr = PyObject_HasAttr + cdef obj_lookupattr_func* _pobject_LookupAttr = _PyObject_LookupAttr + + # isbstr returns whether obj is bstr instance or not. + # it avoids going to isinstance unless really needed because isinstance, + # internally, uses _PyObject_LookupAttr and we need to patch that function + # with using isbstr in the hook. + cdef bint isbstr(obj) except -1: + if not PyBytes_Check(obj): + return False + if Py_TYPE(obj) == pybstr: + return True + # it might be also a pybstr subclass + return isinstance(obj, pybstr) cdef object _object_xGetAttr(object obj, object name): -# fprintf(stderr, "xgetattr...\n") - if isinstance(name, pybstr): + if isbstr(name): name = pyustr(name) return _pobject_GetAttr(obj, name) - cdef int _object_xSetAttr(object obj, object name, object v) except -1: -# fprintf(stderr, "xsetattr...\n") - if isinstance(name, pybstr): + cdef int _object_xSetAttr(object obj, object name, object v) except -1: # XXX v=NULL on del + if isbstr(name): name = pyustr(name) return _pobject_SetAttr(obj, name, v) + cdef bint _object_xHasAttr(object obj, object name): # no except + if isbstr(name): + name = pyustr(name) + return _pobject_HasAttr(obj, name) + + + cdef int _object_xLookupAttr(object obj, object name, PyObject** pres) except -1: + if isbstr(name): + name = pyustr(name) + return _pobject_LookupAttr(obj, name, pres) + cdef _patch_capi_object_attr_bstr(): IF PY3: - cpatch(&_pobject_GetAttr, _object_xGetAttr) - cpatch(&_pobject_SetAttr, _object_xSetAttr) + cpatch(&_pobject_GetAttr, 
_object_xGetAttr) + cpatch(&_pobject_SetAttr, _object_xSetAttr) + cpatch(&_pobject_HasAttr, _object_xHasAttr) + cpatch(&_pobject_LookupAttr, _object_xLookupAttr) # ---- misc ---- @@ -2397,7 +2428,7 @@ cdef _patch_str(): _patch_capi_str_format() _patch_capi_object_str() - _patch_capi_object_attr_bstr() + _patch_capi_object_attr_bstr() # XXX activate under plain py as well _patch_capi_unicode_decode_as_bstr() _patch_str_pickle() # ... diff --git a/golang/_golang_str_test.pyx b/golang/_golang_str_test.pyx new file mode 100644 index 0000000..906a532 --- /dev/null +++ b/golang/_golang_str_test.pyx @@ -0,0 +1,47 @@ +# -*- coding: utf-8 -*- +# cython: language_level=2 +# distutils: language=c++ +# +# Copyright (C) 2024 Nexedi SA and Contributors. +# Kirill Smelkov +# +# This program is free software: you can Use, Study, Modify and Redistribute +# it under the terms of the GNU General Public License version 3, or (at your +# option) any later version, as published by the Free Software Foundation. +# +# You can also Link and Combine this program with other software covered by +# the terms of any of the Free Software licenses or any of the Open Source +# Initiative approved licenses and Convey the resulting work. Corresponding +# source of such a combination shall include the source code for all other +# software used. +# +# This program is distributed WITHOUT ANY WARRANTY; without even the implied +# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# See COPYING file for full licensing terms. +# See https://www.nexedi.com/licensing for rationale and options. + +# helpers for golang_str_test.py that need C-level access. 
+ +from cpython cimport PyObject_GetAttr, PyObject_SetAttr, PyObject_DelAttr, PyObject_HasAttr + +def CPyObject_GetAttr(obj, attr): return PyObject_GetAttr(obj, attr) +def CPyObject_SetAttr(obj, attr, v): PyObject_SetAttr(obj, attr, v) +def CPyObject_DelAttr(obj, attr): PyObject_DelAttr(obj, attr) +def CPyObject_HasAttr(obj, attr): return PyObject_HasAttr(obj, attr) + + +IF PY3: + cdef extern from "Python.h": + int _PyObject_LookupAttr(object obj, object attr, PyObject** pres) except -1 + + def CPyObject_LookupAttr(obj, attr): + cdef PyObject* res + _PyObject_LookupAttr(obj, attr, &res) + if res == NULL: + raise AttributeError((obj, attr)) + return res + +# XXX +more capi func +#def CPyObject_GenericGetAttr(obj, attr): return PyObject_GenericGetAttr(obj, attr) +#def CPyObject_GenericSetAttr(obj, attr, v): PyObject_GenericSetAttr(obj, attr, v) diff --git a/golang/_golang_test.pyx b/golang/_golang_test.pyx index 3c9f60e..11fe7c3 100644 --- a/golang/_golang_test.pyx +++ b/golang/_golang_test.pyx @@ -435,3 +435,8 @@ cdef void _bench_select_nogil__func1(chan[int] ch1, chan[int] ch2, chan[structZ] if not ok: done.close() return + + +# ---- strings ----- + +include "_golang_str_test.pyx" diff --git a/golang/golang_str_test.py b/golang/golang_str_test.py index dfc2c92..9eac585 100644 --- a/golang/golang_str_test.py +++ b/golang/golang_str_test.py @@ -26,6 +26,7 @@ from golang.gcompat import qq from golang.strconv_test import byterange from golang.golang_test import readfile, assertDoc, _pyrun, dir_testprog, PIPE +from golang import _golang_test from gpython import _tEarlyStrSubclass from pytest import raises, mark, skip import sys @@ -2631,7 +2632,7 @@ def _(delta): # # XXX note !gpystr_only ... # XXX also test bytes? -def tests_strings_early_str_subclass(): +def test_strings_early_str_subclass(): xstr = _tEarlyStrSubclass # .tp_new should be adjusted to point to current str @@ -2664,6 +2665,62 @@ def tests_strings_early_str_subclass(): # XXX more... 
+# verify that all string types are accepted by getattr/setattr/delattr/hasattr & co. +@mark.parametrize('tx', (str, bstr, ustr)) +def test_strings_wrt_xxxattr(tx): + x = xstr(u'мир', tx) + assert type(x) is tx + + class C: pass + obj = C() + + t = _golang_test + vgetattr = [getattr, t.CPyObject_GetAttr] + [t.CPyObject_LookupAttr] if six.PY3 else [] + vsetattr = [setattr, t.CPyObject_SetAttr] + vdelattr = [delattr, t.CPyObject_DelAttr] + vhasattr = [hasattr, t.CPyObject_HasAttr] + + value = object() + + # run runs f on each element of v. + def run(f, v): + for e in v: + f(e) + + # attr is initially missing + def _(ga): + with raises(AttributeError): ga(obj, x) + run(_, vgetattr) + + def _(ha): + assert ha(obj, x) is False + run(_, vhasattr) + + def _(da): + with raises(AttributeError): da(obj, x) + run(_, vdelattr) + + # set attr -> make sure it is there -> del + for sa in vsetattr: + for da in vdelattr: + def _(ha): + assert ha(obj, x) is False + run(_, vhasattr) + sa(obj, x, value) + def _(ha): + assert ha(obj, x) is True + run(_, vhasattr) + def _(ga): + assert ga(obj, x) is value + da(obj, x) + def _(ha): + assert ha(obj, x) is False + run(_, vhasattr) + def _(ga): + with raises(AttributeError): ga(obj, x) + run(_, vgetattr) + + # ---- issues hit by users ---- # fixes for below issues have their corresponding tests in the main part above, but # we also add tests with original code where problems were hit. 
diff --git a/setup.py b/setup.py index 8d675b8..db7910e 100644 --- a/setup.py +++ b/setup.py @@ -510,7 +510,9 @@ def defif(name, ok): Ext('golang._golang_test', ['golang/_golang_test.pyx', 'golang/runtime/libgolang_test_c.c', - 'golang/runtime/libgolang_test.cpp']), + 'golang/runtime/libgolang_test.cpp'], + depends = [ + 'golang/_golang_str_test.pyx']), Ext('golang.pyx._runtime_test', ['golang/pyx/_runtime_test.pyx'], From a69d44dda5c83b8bd8664f36c9e7b775fed6af3a Mon Sep 17 00:00:00 2001 From: Kirill Smelkov Date: Wed, 8 May 2024 17:30:37 +0300 Subject: [PATCH 27/29] fixup! X golang_str: More fixes for bstr to be accepted as name of an attribute Contrary to py3.11, py3.9 also explicitly checks for unicode inside builtin getattr. -> Patch that explicitly as well. --- golang/_golang_str.pyx | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/golang/_golang_str.pyx b/golang/_golang_str.pyx index 890ab9a..94bcad3 100644 --- a/golang/_golang_str.pyx +++ b/golang/_golang_str.pyx @@ -2057,6 +2057,36 @@ cdef _patch_capi_object_attr_bstr(): cpatch(&_pobject_HasAttr, _object_xHasAttr) cpatch(&_pobject_LookupAttr, _object_xLookupAttr) + # py3 < 3.11 also verifies name to be unicode + # XXX move out of _patch_capi* ? 
+ import builtins + cdef object builtins_getattr = builtins.getattr + cdef object builtins_setattr = builtins.setattr + cdef object builtins_delattr = builtins.delattr + cdef object builtins_hasattr = builtins.hasattr + + def xgetattr(obj, name, *argv): + if isbstr(name): + name = pyustr(name) + return builtins_getattr(obj, name, *argv) + def xsetattr(obj, name, value): + if isbstr(name): + name = pyustr(name) + return builtins_setattr(obj, name, value) + def xdelattr(obj, name): + if isbstr(name): + name = pyustr(name) + return builtins_delattr(obj, name) + def xhasattr(obj, name): + if isbstr(name): + name = pyustr(name) + return builtins_hasattr(obj, name) + + builtins.getattr = xgetattr + builtins.setattr = xsetattr + builtins.delattr = xdelattr + builtins.hasattr = xhasattr + # ---- misc ---- From abf3dcec0309f0072093ec769158a16f668247cc Mon Sep 17 00:00:00 2001 From: Kirill Smelkov Date: Fri, 10 May 2024 12:22:23 +0300 Subject: [PATCH 28/29] X golang_str: Fix bstr/ustr __add__ and friends to return NotImplemented wrt unsupported types In bbbb58f0 (golang_str: bstr/ustr support for + and *) I've added support for binary string operations, but similarly to __eq__ did not handle correctly the case for arbitrary arguments that potentially define __radd__ and similar. As the result it breaks when running e.g. bstr + pyparsing.Regex File ".../pyparsing-2.4.7-py2.7.egg/pyparsing.py", line 6591, in pyparsing_common _full_ipv6_address = (_ipv6_part + (':' + _ipv6_part) * 7).setName("full IPv6 address") File "golang/_golang_str.pyx", line 469, in golang._golang._pybstr.__add__ return pyb(zbytes.__add__(a, _pyb_coerce(b))) File "golang/_golang_str.pyx", line 243, in golang._golang._pyb_coerce raise TypeError("b: coerce: invalid type %s" % type(x)) TypeError: b: coerce: invalid type because pyparsing.Regex is a type, that does not inherit from str, but defines its own __radd__ to handle str + Regex as Regex. 
-> Fix it by returning NotImplemented from under __add__ and other operations where it is needed so that bstr and ustr behave in the same way as builtin str wrt third types, but care to handle bstr/ustr promise that only explicit conversion through `b` and `u` accept objects with buffer interface. Automatic coercion does not. --- golang/_golang_str.pyx | 28 ++++++++++++++++++--- golang/golang_str_test.py | 51 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 4 deletions(-) diff --git a/golang/_golang_str.pyx b/golang/_golang_str.pyx index 94bcad3..b6d8640 100644 --- a/golang/_golang_str.pyx +++ b/golang/_golang_str.pyx @@ -466,7 +466,13 @@ cdef class _pybstr(bytes): # https://github.com/cython/cython/issues/711 if type(a) is not pybstr: assert type(b) is pybstr return b.__radd__(a) - return pyb(zbytes.__add__(a, _pyb_coerce(b))) + try: + b = _pyb_coerce(b) + except TypeError: + if not hasattr(b, '__radd__'): + raise # don't let python to handle e.g. bstr + memoryview automatically + return NotImplemented + return pyb(zbytes.__add__(a, b)) def __radd__(b, a): # a.__add__(b) returned NotImplementedError, e.g. for unicode.__add__(bstr) @@ -484,7 +490,11 @@ cdef class _pybstr(bytes): # https://github.com/cython/cython/issues/711 if type(a) is not pybstr: assert type(b) is pybstr return b.__rmul__(a) - return pyb(zbytes.__mul__(a, b)) + try: + _ = zbytes.__mul__(a, b) + except TypeError: # TypeError: `b` cannot be interpreted as an integer + return NotImplemented + return pyb(_) def __rmul__(b, a): return b.__mul__(a) @@ -821,7 +831,13 @@ cdef class _pyustr(unicode): if type(a) is not pyustr: assert type(b) is pyustr, type(b) return b.__radd__(a) - return pyu(zunicode.__add__(a, _pyu_coerce(b))) + try: + b = _pyu_coerce(b) + except TypeError: + if not hasattr(b, '__radd__'): + raise # don't let py2 to handle e.g. 
unicode + buffer automatically + return NotImplemented + return pyu(zunicode.__add__(a, b)) def __radd__(b, a): # a.__add__(b) returned NotImplementedError, e.g. for unicode.__add__(bstr) @@ -841,7 +857,11 @@ cdef class _pyustr(unicode): if type(a) is not pyustr: assert type(b) is pyustr, type(b) return b.__rmul__(a) - return pyu(zunicode.__mul__(a, b)) + try: + _ = zunicode.__mul__(a, b) + except TypeError: # TypeError: `b` cannot be interpreted as an integer + return NotImplemented + return pyu(_) def __rmul__(b, a): return b.__mul__(a) diff --git a/golang/golang_str_test.py b/golang/golang_str_test.py index 9eac585..f9ae6c7 100644 --- a/golang/golang_str_test.py +++ b/golang/golang_str_test.py @@ -982,6 +982,57 @@ def test_strings_ops2_bufreject(tx, ty): y < x +# verify string operations like `x + y` for x being str/bstr/ustr and y being +# arbitrary type that defines __rop__. +@mark.parametrize('tx', (str, bstr, ustr)) +def test_strings_ops2_rop_any(tx): + # ROp(rop, x, y) represents call to y.__rop__(x) + class ROp: + def __init__(r, rop, x, y): + r.rop, r.x, r.y = rop, x, y + def __repr__(r): + return 'ROp(%r, %r, %r)' % (r.rop, r.x, r.y) + def __eq__(a, b): + return isinstance(b, ROp) and a.rop == b.rop and a.x is b.x and a.y is b.y + def __ne__(a, b): + return not (a == b) + + class C: + def __radd__(b, a): return ROp('radd', a, b) + def __rsub__(b, a): return ROp('rsub', a, b) + def __rmul__(b, a): return ROp('rmul', a, b) + def __rdiv__(b, a): return ROp('rdiv', a, b) + def __rtruediv__(b, a): return ROp('rtruediv', a, b) + def __rfloordiv__(b, a): return ROp('rfloordiv', a, b) + def __rmod__(b, a): return ROp('rmod', a, b) + def __rdivmod__(b, a): return ROp('rdivmod', a, b) + def __rpow__(b, a): return ROp('rpow', a, b) + def __rlshift__(b, a): return ROp('rlshift', a, b) + def __rrshift__(b, a): return ROp('rrshift', a, b) + def __rand__(b, a): return ROp('rand', a, b) + def __rxor__(b, a): return ROp('rxor', a, b) + def __ror__(b, a): return 
ROp('ror', a, b) + + + x = xstr(u'мир', tx) + y = C() + R = lambda rop: ROp(rop, x, y) + + assert x + y == R('radd') + assert x - y == R('rsub') + assert x * y == R('rmul') + assert x / y == R(x32('rtruediv', 'rdiv')) + assert x // y == R('rfloordiv') + # x % y is always handled by str and verified in test_strings_mod_and_format + assert divmod(x,y) == R('rdivmod') + assert x ** y == R('rpow') + assert x << y == R('rlshift') + assert x >> y == R('rrshift') + assert x & y == R('rand') + assert x ^ y == R('rxor') + assert x | y == R('ror') + + # verify string operations like `x == *` for x being bstr/ustr. # Those operations must succeed for any hashable type or else bstr/ustr could # not be used as dict keys. From 93e9c25a6f527a5aa26cab92f3c0cdc57271b68d Mon Sep 17 00:00:00 2001 From: Kirill Smelkov Date: Fri, 10 May 2024 17:14:38 +0300 Subject: [PATCH 29/29] X golang_str: Add ustr.decode for symmetry with bstr.decode and because gpy2 breaks without it Without working unicode.decode gpy2 fails when running ERP5 as follows: $ /srv/slapgrid/slappart49/t/ekg/i/5/bin/runTestSuite --help No handlers could be found for logger "SecurityInfo" Traceback (most recent call last): File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/bin/.runTestSuite.pyexe", line 296, in main() File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/parts/pygolang/gpython/__init__.py", line 484, in main pymain(argv, init) File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/parts/pygolang/gpython/__init__.py", line 292, in pymain run(mmain) File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/parts/pygolang/gpython/__init__.py", line 192, in run _execfile(filepath, mmain.__dict__) File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/parts/pygolang/gpython/__init__.py", line 339, in _execfile six.exec_(code, globals, locals) File 
"/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/eggs/six-1.16.0-py2.7.egg/six.py", line 735, in exec_ exec("""exec _code_ in _globs_, _locs_""") File "", line 1, in File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/bin/runTestSuite", line 10, in from Products.ERP5Type.tests.runTestSuite import main; sys.exit(main()) File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/parts/erp5/product/ERP5Type/__init__.py", line 96, in from . import ZopePatch File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/parts/erp5/product/ERP5Type/ZopePatch.py", line 75, in from Products.ERP5Type.patches import ZopePageTemplateUtils File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/parts/erp5/product/ERP5Type/patches/ZopePageTemplateUtils.py", line 58, in convertToUnicode(u'', 'text/xml', ()) File "/srv/slapgrid/slappart49/t/ekg/soft/b5048b47894a7612651c7fe81c2c8636/eggs/Zope-4.8.9+slapospatched002-py2.7.egg/Products/PageTemplates/utils.py", line 73, in convertToUnicode return source.decode(encoding), encoding AttributeError: unreadable attribute and in general if we treat both bstr ans ustr being two different representations of the same entity, if we have bstr.decode, having ustr.decode is also needed for symmetry with both operations converting bytes representation of the string into unicode. Now there is full symmetry in between bstr/ustr and encode/decode. Quoting updated encode/decode text: Encode encodes unicode representation of the string into bytes, leaving string domain. Decode decodes bytes representation of the string into ustr, staying inside string domain. Both bstr and ustr are accepted by encode and decode treating them as two different representations of the same entity. On encoding, for bstr, the string representation is first converted to unicode and encoded to bytes from there. For ustr unicode representation of the string is directly encoded. 
On decoding, for ustr, the string representation is first converted to bytes and decoded to unicode from there. For bstr bytes representation of the string is directly decoded. --- golang/_golang_str.pyx | 61 +++++++++++++++------- golang/golang_str_test.py | 106 +++++++++++++++++++++----------------- 2 files changed, 100 insertions(+), 67 deletions(-) diff --git a/golang/_golang_str.pyx b/golang/_golang_str.pyx index b6d8640..0e43f48 100644 --- a/golang/_golang_str.pyx +++ b/golang/_golang_str.pyx @@ -528,25 +528,31 @@ cdef class _pybstr(bytes): # https://github.com/cython/cython/issues/711 # encode/decode # - # Encoding strings - both bstr and ustr - convert type to bytes leaving string domain. + # Encode encodes unicode representation of the string into bytes, leaving string domain. + # Decode decodes bytes representation of the string into ustr, staying inside string domain. # - # Encode treats bstr and ustr as string, encoding unicode representation of - # the string to bytes. For bstr it means that the string representation is - # first converted to unicode and encoded to bytes from there. For ustr - # unicode representation of the string is directly encoded. + # Both bstr and ustr are accepted by encode and decode treating them as two + # different representations of the same entity. # - # Decoding strings is not provided. However for bstr the decode is provided - # treating input data as raw bytes and producing ustr as the result. + # On encoding, for bstr, the string representation is first converted to + # unicode and encoded to bytes from there. For ustr unicode representation + # of the string is directly encoded. + # + # On decoding, for ustr, the string representation is first converted to + # bytes and decoded to unicode from there. For bstr bytes representation of + # the string is directly decoded. # # NOTE __bytes__ and encode are the only operations that leave string domain. 
def encode(self, encoding=None, errors=None): # -> bytes encoding, errors = _encoding_with_defaults(encoding, errors) + if encoding == 'utf-8' and errors == 'surrogateescape': + return _bdata(self) + # on py2 e.g. bytes.encode('string-escape') works on bytes directly if PY_MAJOR_VERSION < 3: - codec = pycodecs.lookup(encoding) - if not codec._is_text_encoding or \ - encoding in ('string-escape',): # string-escape also works on bytes + codec = _pycodecs_lookup_binary(encoding) + if codec is not None: return codec.encode(self, errors)[0] return pyu(self).encode(encoding, errors) @@ -894,15 +900,23 @@ cdef class _pyustr(unicode): encoding, errors = _encoding_with_defaults(encoding, errors) if encoding == 'utf-8' and errors == 'surrogateescape': - x = _utf8_encode_surrogateescape(self) - else: - x = zunicode.encode(self, encoding, errors) - return x + return _utf8_encode_surrogateescape(self) - if PY_MAJOR_VERSION < 3: - # whiteout decode inherited from unicode - # TODO ideally whiteout it in such a way that ustr.decode also raises AttributeError - decode = property(doc='ustr has no decode') + # on py2 e.g. 'string-escape' works on bytes + if PY_MAJOR_VERSION < 3: + codec = _pycodecs_lookup_binary(encoding) + if codec is not None: + return codec.encode(pyb(self), errors)[0] + + return zunicode.encode(self, encoding, errors) + + def decode(self, encoding=None, errors=None): # -> ustr | bstr for encodings like string-escape + encoding, errors = _encoding_with_defaults(encoding, errors) + + if encoding == 'utf-8' and errors == 'surrogateescape': + return pyu(self) + + return pyb(self).decode(encoding, errors) # all other string methods @@ -2161,6 +2175,15 @@ cdef extern from "Python.h": """ bint _XPyMapping_Check(object o) +# _pycodecs_lookup_binary returns codec corresponding to encoding if the codec works on binary input. +# example of such codecs are string-escape and hex encodings. 
+cdef _pycodecs_lookup_binary(encoding): # -> codec | None (text) | LookupError (no such encoding) + codec = pycodecs.lookup(encoding) + if not codec._is_text_encoding or \ + encoding in ('string-escape',): # string-escape also works on bytes + return codec + return None + # ---- UTF-8 encode/decode ---- @@ -2426,8 +2449,6 @@ cdef _patch_str(): # XXX explain bpreserve_slots = upreserve_slots = ("maketrans",) - if PY_MAJOR_VERSION < 3: - upreserve_slots += ("decode",) # patch unicode to be pyustr. This patches # - unicode (py2) diff --git a/golang/golang_str_test.py b/golang/golang_str_test.py index f9ae6c7..0896032 100644 --- a/golang/golang_str_test.py +++ b/golang/golang_str_test.py @@ -657,58 +657,61 @@ def test_strings_encodedecode(): us = u('мир') bs = b('май') - _ = us.encode(); assert type(_) is bytes; assert _ == xbytes('мир') - _ = us.encode('utf-8'); assert type(_) is bytes; assert _ == xbytes('мир') - _ = bs.encode(); assert type(_) is bytes; assert _ == xbytes('май') - _ = bs.encode('utf-8'); assert type(_) is bytes; assert _ == xbytes('май') - - # TODO also raise AttributeError on .encode/.decode lookup on classes - assert not hasattr(us, 'decode') #; assert not hasattr(ustr, 'decode') - _ = bs.decode(); assert type(_) is ustr; assert _udata(_) == u'май' - _ = bs.decode('utf-8'); assert type(_) is ustr; assert _udata(_) == u'май' + # encode does obj.encode and makes sure result type is bytes + def encode(obj, *argv): + _ = obj.encode(*argv) + assert type(_) is bytes + return _ + + # decode does obj.decode and makes sure result type is ustr + def decode(obj, *argv): + _ = obj.decode(*argv) + assert type(_) is ustr + return _ + + _ = encode(us); assert _ == xbytes('мир') + _ = encode(us, 'utf-8'); assert _ == xbytes('мир') + _ = encode(bs); assert _ == xbytes('май') + _ = encode(bs, 'utf-8'); assert _ == xbytes('май') + + _ = decode(us); assert _udata(_) == u'мир' + _ = decode(us, 'utf-8'); assert _udata(_) == u'мир' + _ = decode(bs); assert _udata(_) == 
u'май' + _ = decode(bs, 'utf-8'); assert _udata(_) == u'май' # !utf-8 - k8mir = u'мир'.encode('koi8-r') - b_k8mir = b(k8mir) - assert type(b_k8mir) is bstr - assert _bdata(b_k8mir) == k8mir - assert _bdata(b_k8mir) == b'\xcd\xc9\xd2' + k8mir = u'мир'.encode('koi8-r'); assert k8mir == b'\xcd\xc9\xd2' + b_k8mir = b(k8mir); assert type(b_k8mir) is bstr; assert _bdata(b_k8mir) == b'\xcd\xc9\xd2' + u_k8mir = u(k8mir); assert type(u_k8mir) is ustr; assert _udata(u_k8mir) == u'\udccd\udcc9\udcd2' - _ = b_k8mir.decode('koi8-r') - assert type(_) is ustr - assert _udata(_) == u'мир' + _ = decode(b_k8mir, 'koi8-r'); assert _udata(_) == u'мир' + _ = decode(u_k8mir, 'koi8-r'); assert _udata(_) == u'мир' - cpmir = us.encode('cp1251') - assert type(cpmir) is bytes - assert cpmir == u'мир'.encode('cp1251') - assert cpmir == b'\xec\xe8\xf0' + _ = encode(us, 'cp1251'); assert _ == u'мир'.encode('cp1251'); assert _ == b'\xec\xe8\xf0' + _ = encode(bs, 'cp1251'); assert _ == u'май'.encode('cp1251'); assert _ == b'\xec\xe0\xe9' # decode/encode errors - u_k8mir = b_k8mir.decode() # no decode error with - assert type(u_k8mir) is ustr # default parameters - assert _udata(u_k8mir) == u'\udccd\udcc9\udcd2' - _ = b_k8mir.decode('utf-8', 'surrogateescape') # no decode error with - assert type(_) is ustr # explicit utf-8/surrogateescape - assert _udata(_) == _udata(u_k8mir) - - with raises(UnicodeDecodeError): # decode error if encoding is explicitly specified - b_k8mir.decode('utf-8') - with raises(UnicodeDecodeError): - b_k8mir.decode('utf-8', 'strict') - with raises(UnicodeDecodeError): - b_k8mir.decode('ascii') - - with raises(UnicodeEncodeError): - us.encode('ascii') - - _ = u_k8mir.encode() # no encode error with - assert type(_) is bytes # default parameters - assert _ == k8mir - _ = u_k8mir.encode('utf-8', 'surrogateescape') # no encode error with - assert type(_) is bytes # explicit utf-8/surrogateescape + _ = decode(b_k8mir); assert _ == u_k8mir # no decode error with default 
parameters + _ = decode(b_k8mir, 'utf-8', 'surrogateescape') # or with explicit utf-8/surrogateescape + assert _ == u_k8mir + _ = decode(u_k8mir); assert _ == u_k8mir + _ = decode(u_k8mir, 'utf-8', 'surrogateescape'); assert _ == u_k8mir + + with raises(UnicodeDecodeError): b_k8mir.decode('utf-8') # decode error on unmatching explicit encoding + with raises(UnicodeDecodeError): u_k8mir.decode('utf-8') + with raises(UnicodeDecodeError): b_k8mir.decode('utf-8', 'strict') + with raises(UnicodeDecodeError): u_k8mir.decode('utf-8', 'strict') + with raises(UnicodeDecodeError): b_k8mir.decode('ascii') + with raises(UnicodeDecodeError): u_k8mir.decode('ascii') + + with raises(UnicodeEncodeError): us.encode('ascii') # encode error if target encoding cannot represent string + with raises(UnicodeEncodeError): bs.encode('ascii') + + _ = encode(u_k8mir); assert _ == k8mir # no encode error with default parameters + _ = encode(u_k8mir, 'utf-8', 'surrogateescape') # or with explicit utf-8/surrogateescape assert _ == k8mir - _ = b_k8mir.encode() # bstr.encode = bstr -> ustr -> encode - assert type(_) is bytes + _ = encode(b_k8mir); assert _ == k8mir # bstr.encode = bstr -> ustr -> encode + _ = encode(b_k8mir, 'utf-8', 'surrogateescape') assert _ == k8mir # on py2 unicode.encode accepts surrogate pairs and does not complain @@ -726,19 +729,28 @@ def test_strings_encodedecode(): # verify that this exact semantic is preserved if six.PY3: with raises(LookupError): bs.decode('hex') + with raises(LookupError): us.decode('hex') with raises(LookupError): bs.decode('string-escape') + with raises(LookupError): us.decode('string-escape') else: _ = bs.decode('string-escape'); assert type(_) is bstr; assert _ == bs + _ = us.decode('string-escape'); assert type(_) is bstr; assert _ == us _ = b(r'x\'y').decode('string-escape'); assert type(_) is bstr; assert _bdata(_) == b"x'y" + _ = u(r'x\'y').decode('string-escape'); assert type(_) is bstr; assert _bdata(_) == b"x'y" _ = 
b('616263').decode('hex'); assert type(_) is bstr; assert _bdata(_) == b"abc" + _ = u('616263').decode('hex'); assert type(_) is bstr; assert _bdata(_) == b"abc" # similarly for bytes.encode if six.PY3: with raises(LookupError): bs.encode('hex') + with raises(LookupError): us.encode('hex') with raises(LookupError): bs.encode('string-escape') + with raises(LookupError): us.encode('string-escape') else: - _ = bs.encode('hex'); assert type(_) is bytes; assert _ == b'd0bcd0b0d0b9' - _ = bs.encode('string-escape'); assert type(_) is bytes; assert _ == br'\xd0\xbc\xd0\xb0\xd0\xb9' + _ = encode(bs, 'hex'); assert _ == b'd0bcd0b0d0b9' + _ = encode(us, 'hex'); assert _ == b'd0bcd0b8d180' + _ = encode(bs, 'string-escape'); assert _ == br'\xd0\xbc\xd0\xb0\xd0\xb9' + _ = encode(us, 'string-escape'); assert _ == br'\xd0\xbc\xd0\xb8\xd1\x80' # verify string operations like `x * 3` for all cases from bytes, bytearray, unicode, bstr and ustr.