aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Carl Friedrich Bolz-Tereick <cfbolz@gmx.de> 2021-03-02 20:37:27 +0100
committer Carl Friedrich Bolz-Tereick <cfbolz@gmx.de> 2021-03-02 20:37:27 +0100
commit df4810d86c2d8ae70a59a0c9f163c0f3fbc44ab2 (patch)
tree f2f76762def2af41963672a4cc06774673d3fe69
parent add ascii fast paths to the tolower/toupper functions of the unicode dbs too (diff)
download pypy-df4810d86c2d8ae70a59a0c9f163c0f3fbc44ab2.tar.gz
pypy-df4810d86c2d8ae70a59a0c9f163c0f3fbc44ab2.tar.bz2
pypy-df4810d86c2d8ae70a59a0c9f163c0f3fbc44ab2.zip
some ascii fast paths of latin-1 encoding/decoding
-rw-r--r-- pypy/interpreter/test/test_unicodehelper.py 7
-rw-r--r-- pypy/interpreter/unicodehelper.py 9
-rw-r--r-- pypy/objspace/std/test/test_unicodeobject.py 8
-rw-r--r-- pypy/objspace/std/unicodeobject.py 3
4 files changed, 23 insertions, 4 deletions
diff --git a/pypy/interpreter/test/test_unicodehelper.py b/pypy/interpreter/test/test_unicodehelper.py
index 4d849cd9cf..34e08da5ac 100644
--- a/pypy/interpreter/test/test_unicodehelper.py
+++ b/pypy/interpreter/test/test_unicodehelper.py
@@ -11,6 +11,7 @@ from rpython.rlib import rutf8
from pypy.interpreter.unicodehelper import str_decode_utf8
from pypy.interpreter.unicodehelper import utf8_encode_ascii, str_decode_ascii
+from pypy.interpreter.unicodehelper import utf8_encode_latin_1
from pypy.interpreter import unicodehelper as uh
from pypy.module._codecs.interp_codecs import CodecState
@@ -91,3 +92,9 @@ def test_encode_decimal(space):
result = uh.unicode_encode_decimal(
u'12\u1234'.encode('utf8'), 'xmlcharrefreplace', handler)
assert result == '12&#4660;'
+
+def test_utf8_encode_latin1_ascii_prefix():  # non-ASCII input that starts with an ASCII run
+ utf8 = b'abcde\xc3\xa4g'  # ASCII "abcde", then U+00E4 as the two UTF-8 bytes C3 A4, then ASCII "g"
+ b = utf8_encode_latin_1(utf8, None, None)  # errors/errorhandler are None; every code point here is <= 0xFF
+ assert b == b'abcde\xe4g'  # ASCII prefix and trailing "g" unchanged; U+00E4 becomes the single byte E4
+
diff --git a/pypy/interpreter/unicodehelper.py b/pypy/interpreter/unicodehelper.py
index 34fde1c874..d17ccb767b 100644
--- a/pypy/interpreter/unicodehelper.py
+++ b/pypy/interpreter/unicodehelper.py
@@ -155,14 +155,15 @@ def utf8_encode_latin_1(s, errors, errorhandler):
try:
rutf8.check_ascii(s)
return s
- except rutf8.CheckError:
- return _utf8_encode_latin_1_slowpath(s, errors, errorhandler)
+ except rutf8.CheckError, e:
+ return _utf8_encode_latin_1_slowpath(s, e.pos, errors, errorhandler)
-def _utf8_encode_latin_1_slowpath(s, errors, errorhandler):
+def _utf8_encode_latin_1_slowpath(s, first_non_ascii_char, errors, errorhandler):
size = len(s)
result = StringBuilder(size)
index = 0
- pos = 0
+ result.append_slice(s, 0, first_non_ascii_char)
+ pos = first_non_ascii_char
while pos < size:
ch = rutf8.codepoint_at_pos(s, pos)
if ch <= 0xFF:
diff --git a/pypy/objspace/std/test/test_unicodeobject.py b/pypy/objspace/std/test/test_unicodeobject.py
index e8763dc496..7f88ed9721 100644
--- a/pypy/objspace/std/test/test_unicodeobject.py
+++ b/pypy/objspace/std/test/test_unicodeobject.py
@@ -217,6 +217,14 @@ class TestUnicodeObject:
uniupper, = unicodedb.toupper_full(ch)
assert chr(uniupper) == chr(ch).upper()
+ def test_latin1_encode_shortcut_ascii(self, monkeypatch):  # encode_object must handle ASCII text without rutf8.check_ascii
+ from rpython.rlib import rutf8
+ from pypy.objspace.std.unicodeobject import encode_object
+ monkeypatch.setattr(rutf8, "check_ascii", None)  # None is not callable, so any call to rutf8.check_ascii would raise
+ w_b = encode_object(self.space, self.space.newutf8("abc", 3), "latin-1", "strict")  # only the "latin-1" spelling is exercised
+ assert self.space.bytes_w(w_b) == "abc"  # ASCII input must come back as the same bytes
+
+
class AppTestUnicodeStringStdOnly:
def test_compares(self):
assert u'a' == 'a'
diff --git a/pypy/objspace/std/unicodeobject.py b/pypy/objspace/std/unicodeobject.py
index 0be4a9e55c..16edebfb03 100644
--- a/pypy/objspace/std/unicodeobject.py
+++ b/pypy/objspace/std/unicodeobject.py
@@ -1216,6 +1216,9 @@ def encode_object(space, w_obj, encoding, errors):
if rutf8.has_surrogates(utf8):
utf8 = rutf8.reencode_utf8_with_surrogates(utf8)
return space.newbytes(utf8)
+ if ((encoding == "latin1" or encoding == "latin-1") and  # group the aliases: 'and' binds tighter than 'or'
+ isinstance(w_obj, W_UnicodeObject) and w_obj.is_ascii()):
+ return space.newbytes(w_obj._utf8)  # fast path: the stored utf8 bytes are returned as-is only when the object is pure ASCII
return encode(space, w_obj, encoding, errors)