Index: src/org/python/antlr/GrammarActions.java =================================================================== --- src/org/python/antlr/GrammarActions.java (revision 6421) +++ src/org/python/antlr/GrammarActions.java (working copy) @@ -9,6 +9,7 @@ import org.python.core.PyLong; import org.python.core.PyString; import org.python.core.PyUnicode; +import org.python.core.codecs; import org.python.antlr.ast.alias; import org.python.antlr.ast.arguments; import org.python.antlr.ast.boolopType; @@ -441,8 +442,12 @@ ustring); } } else if (raw) { - // Raw str without an encoding or raw unicode: simply passthru + // Raw str without an encoding or raw unicode string = string.substring(start, end); + if (ustring) { + // Raw unicode: handle unicode escapes + string = codecs.PyUnicode_DecodeRawUnicodeEscape(string, "strict"); + } } else { // Plain unicode: already decoded, just handle escapes string = PyString.decode_UnicodeEscape(string, start, end, "strict", ustring); Index: src/org/python/core/codecs.java =================================================================== --- src/org/python/core/codecs.java (revision 6417) +++ src/org/python/core/codecs.java (working copy) @@ -920,31 +920,40 @@ private static char[] hexdigit = "0123456789ABCDEF".toCharArray(); // The modified flag is used by cPickle. 
- public static String PyUnicode_EncodeRawUnicodeEscape(String str, - String errors, - boolean modifed) { - - int size = str.length(); - StringBuilder v = new StringBuilder(str.length()); - - for (int i = 0; i < size; i++) { - char ch = str.charAt(i); - if (ch >= 256 || (modifed && (ch == '\n' || ch == '\\'))) { + public static String PyUnicode_EncodeRawUnicodeEscape(String str, String errors, + boolean modifed) { + // XXX: Not very efficient + int[] codePoints = new PyUnicode(str).toCodePoints(); + StringBuilder v = new StringBuilder(10 * codePoints.length); + + for (int codePoint : codePoints) { + if (codePoint >= 0x10000) { + // Map 32-bit characters to '\\Uxxxxxxxx' + v.append("\\U"); + v.append(hexdigit[(codePoint >>> 28) & 0xF]); + v.append(hexdigit[(codePoint >>> 24) & 0xF]); + v.append(hexdigit[(codePoint >>> 20) & 0xF]); + v.append(hexdigit[(codePoint >>> 16) & 0xF]); + v.append(hexdigit[(codePoint >>> 12) & 0xF]); + v.append(hexdigit[(codePoint >>> 8) & 0xF]); + v.append(hexdigit[(codePoint >>> 4) & 0xF]); + v.append(hexdigit[codePoint & 0xF]); + } else if (codePoint >= 256 || (modifed && (codePoint == '\n' || codePoint == '\\'))) { + // Map 16-bit characters to '\\uxxxx' v.append("\\u"); - v.append(hexdigit[(ch >>> 12) & 0xF]); - v.append(hexdigit[(ch >>> 8) & 0xF]); - v.append(hexdigit[(ch >>> 4) & 0xF]); - v.append(hexdigit[ch & 0xF]); + v.append(hexdigit[(codePoint >>> 12) & 0xF]); + v.append(hexdigit[(codePoint >>> 8) & 0xF]); + v.append(hexdigit[(codePoint >>> 4) & 0xF]); + v.append(hexdigit[codePoint & 0xF]); } else { - v.append(ch); + v.appendCodePoint(codePoint); } } return v.toString(); } - public static String PyUnicode_DecodeRawUnicodeEscape(String str, - String errors) { + public static String PyUnicode_DecodeRawUnicodeEscape(String str, String errors) { int size = str.length(); StringBuilder v = new StringBuilder(size); for (int i = 0; i < size;) { @@ -968,14 +977,17 @@ v.append(ch); i++; } - if (((i - bs) & 1) == 0 || i >= size || ch != 
'u') { + if (((i - bs) & 1) == 0 + || i >= size + || (ch != 'u' && ch != 'U')) { continue; } v.setLength(v.length() - 1); + int count = ch == 'u' ? 4 : 8; i++; - /* \\uXXXX with 4 hex digits */ + /* \\uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ int x = 0, d = 0, j = 0; - for (; j < 4; j++) { + for (; j < count; j++) { ch = str.charAt(i + j); d = Character.digit(ch, 16); if (d == -1) { @@ -992,8 +1004,8 @@ i + j, "truncated \\uXXXX"); } else { - i += 4; - v.append((char) x); + i += count; + v.appendCodePoint(x); } } return v.toString(); Index: Lib/test/test_unicode_jy.py =================================================================== --- Lib/test/test_unicode_jy.py (revision 6417) +++ Lib/test/test_unicode_jy.py (working copy) @@ -51,6 +51,27 @@ self.assertEqual(ord(bar[2]), 92) self.assertEqual(ord(bar[3]), 110) + for baz in ur'Hello\u0020World !', ur'Hello\U00000020World !': + self.assertEqual(len(baz), 13, repr(baz)) + self.assertEqual(repr(baz), "u'Hello World !'") + self.assertEqual(ord(baz[5]), 32) + + quux = ur'\U00100000' + self.assertEqual(repr(quux), "u'\\U00100000'") + if sys.maxunicode == 0xffff: + self.assertEqual(len(quux), 2) + self.assertEqual(ord(quux[0]), 56256) + self.assertEqual(ord(quux[1]), 56320) + else: + self.assertEqual(len(quux), 1) + self.assertEqual(ord(quux), 1048576) + + def test_raw_unicode_escape(self): + foo = u'\U00100000' + self.assertEqual(foo.encode('raw_unicode_escape'), '\\U00100000') + self.assertEqual(foo.encode('raw_unicode_escape').decode('raw_unicode_escape'), + foo) + def test_encode_decimal(self): self.assertEqual(int(u'\u0039\u0032'), 92) self.assertEqual(int(u'\u0660'), 0) Index: NEWS =================================================================== --- NEWS (revision 6418) +++ NEWS (working copy) @@ -9,6 +9,7 @@ Fix file's repr with Windows paths Fix urllib and urllib2 path handling on Windows Fix r'\Jython25' not considered an abspath on Windows + Fix handling of raw unicode escapes Jython 2.5.0 rc3 Bugs 
fixed