diff -r aa079dc20555 Lib/test/test_bytes.py --- a/Lib/test/test_bytes.py Sat May 11 19:28:25 2013 +0100 +++ b/Lib/test/test_bytes.py Sun May 12 22:50:42 2013 +0300 @@ -181,14 +181,14 @@ # parse time. Maybe someday we might push the error off to later, but for # now I'm just commenting this whole test out. # See http://bugs.jython.org/issue1836 for more. -# def test_encoding(self): -# sample = u"Hello world\n\u1234\u5678\u9abc\udef0" -# for enc in ("utf8", "utf16"): -# b = self.type2test(sample, enc) -# self.assertEqual(b, self.type2test(sample.encode(enc))) -# self.assertRaises(UnicodeEncodeError, self.type2test, sample, "latin1") -# b = self.type2test(sample, "latin1", "ignore") -# self.assertEqual(b, self.type2test(sample[:-4], "utf-8")) + def test_encoding(self): + sample = u"Hello world\n\u1234\u5678\u9abc\udef0" + for enc in ("utf8", "utf16"): + b = self.type2test(sample, enc) + self.assertEqual(b, self.type2test(sample.encode(enc))) + self.assertRaises(UnicodeEncodeError, self.type2test, sample, "latin1") + b = self.type2test(sample, "latin1", "ignore") + self.assertEqual(b, self.type2test(sample[:-4], "utf-8")) def test_decode(self): sample = u"Hello world\n\u1234\u5678\u9abc\def0\def0" diff -r aa079dc20555 Lib/test/test_unicode.py --- a/Lib/test/test_unicode.py Sat May 11 19:28:25 2013 +0100 +++ b/Lib/test/test_unicode.py Sun May 12 22:50:42 2013 +0300 @@ -185,51 +185,51 @@ # No surrogates, no fixup required. self.assert_(u'\u0061' < u'\u20ac') # Non surrogate below surrogate value, no fixup required - #self.assert_(u'\u0061' < u'\ud800\udc02') + self.assert_(u'\u0061' < u'\ud800\udc02') # Non surrogate above surrogate value, fixup required def test_lecmp(s, s2): self.assert_(s < s2) -# def test_fixup(s): -# s2 = u'\ud800\udc01' -# test_lecmp(s, s2) -# s2 = u'\ud900\udc01' -# test_lecmp(s, s2) -# s2 = u'\uda00\udc01' -# test_lecmp(s, s2) -# s2 = u'\udb00\udc01' -# test_lecmp(s, s2) -# s2 = u'\ud800\udd01' -# test_lecmp(s, s2) -# s2 = u'\ud900\udd01' -# test_lecmp(s, s2) -# s2 = u'\uda00\udd01' -# test_lecmp(s, s2) -# s2 = u'\udb00\udd01' -# test_lecmp(s, s2) -# s2 = u'\ud800\ude01' -# test_lecmp(s, s2) -# s2 = u'\ud900\ude01' -# test_lecmp(s, s2) -# s2 = u'\uda00\ude01' -# test_lecmp(s, s2) -# s2 = u'\udb00\ude01' -# test_lecmp(s, s2) -# s2 = u'\ud800\udfff' -# test_lecmp(s, s2) -# s2 = u'\ud900\udfff' -# test_lecmp(s, s2) -# s2 = u'\uda00\udfff' -# test_lecmp(s, s2) -# s2 = u'\udb00\udfff' -# test_lecmp(s, s2) + def test_fixup(s): + s2 = u'\ud800\udc01' + test_lecmp(s, s2) + s2 = u'\ud900\udc01' + test_lecmp(s, s2) + s2 = u'\uda00\udc01' + test_lecmp(s, s2) + s2 = u'\udb00\udc01' + test_lecmp(s, s2) + s2 = u'\ud800\udd01' + test_lecmp(s, s2) + s2 = u'\ud900\udd01' + test_lecmp(s, s2) + s2 = u'\uda00\udd01' + test_lecmp(s, s2) + s2 = u'\udb00\udd01' + test_lecmp(s, s2) + s2 = u'\ud800\ude01' + test_lecmp(s, s2) + s2 = u'\ud900\ude01' + test_lecmp(s, s2) + s2 = u'\uda00\ude01' + test_lecmp(s, s2) + s2 = u'\udb00\ude01' + test_lecmp(s, s2) + s2 = u'\ud800\udfff' + test_lecmp(s, s2) + s2 = u'\ud900\udfff' + test_lecmp(s, s2) + s2 = u'\uda00\udfff' + test_lecmp(s, s2) + s2 = u'\udb00\udfff' + test_lecmp(s, s2) -# test_fixup(u'\ue000') -# test_fixup(u'\uff61') + test_fixup(u'\ue000') + test_fixup(u'\uff61') # Surrogates on both sides, no fixup required - # self.assert_(u'\ud800\udc02' < u'\ud84d\udc56') + self.assert_(u'\ud800\udc02' < u'\ud84d\udc56') def test_islower(self): string_tests.MixinStrUnicodeUserStringTest.test_islower(self) @@ -362,8 +362,8 @@ self.assertEqual(u"%(x)s, %(y)s" % {'x':u"abc", 'y':"def"}, u'abc, def') self.assertEqual(u"%(x)s, %(\xfc)s" % {'x':u"abc", u'\xfc':"def"}, u'abc, def') - # self.assertEqual(u'%c' % 0x1234, u'\u1234') - # self.assertRaises(OverflowError, u"%c".__mod__, (sys.maxunicode+1,)) + self.assertEqual(u'%c' % 0x1234, u'\u1234') + self.assertRaises(OverflowError, u"%c".__mod__, (sys.maxunicode+1,)) # formatting jobs delegated from the string implementation: self.assertEqual('...%(foo)s...' % {'foo':u"abc"}, u'...abc...') @@ -493,15 +493,14 @@ def test_codecs_utf8(self): self.assertEqual(u''.encode('utf-8'), '') self.assertEqual(u'\u20ac'.encode('utf-8'), '\xe2\x82\xac') - # Jython will not compile Unicode literals with surrogate units - #self.assertEqual(u'\ud800\udc02'.encode('utf-8'), '\xf0\x90\x80\x82') - #self.assertEqual(u'\ud84d\udc56'.encode('utf-8'), '\xf0\xa3\x91\x96') + self.assertEqual(u'\ud800\udc02'.encode('utf-8'), '\xf0\x90\x80\x82') + self.assertEqual(u'\ud84d\udc56'.encode('utf-8'), '\xf0\xa3\x91\x96') #self.assertEqual(u'\ud800'.encode('utf-8'), '\xed\xa0\x80') #self.assertEqual(u'\udc00'.encode('utf-8'), '\xed\xb0\x80') - #self.assertEqual( - # (u'\ud800\udc02'*1000).encode('utf-8'), - # '\xf0\x90\x80\x82'*1000 - #) + self.assertEqual( + (u'\ud800\udc02'*1000).encode('utf-8'), + '\xf0\x90\x80\x82'*1000 + ) self.assertEqual( u'\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f' u'\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00' diff -r aa079dc20555 src/org/python/core/PyString.java --- a/src/org/python/core/PyString.java Sat May 11 19:28:25 2013 +0100 +++ b/src/org/python/core/PyString.java Sun May 12 22:50:42 2013 +0300 @@ -228,7 +228,7 @@ continue; } /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */ - else if (ch >= 0xD800 && ch < 0xDC00) { + else if (ch >= 0xD800 && ch < 0xDC00 && i < str.length()) { char ch2 = str.charAt(i++); size--; if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { @@ -519,9 +519,7 @@ /*pass in an int since this can be a UCS-4 character */ private static boolean storeUnicodeCharacter(int value, StringBuilder partialDecode) { - if (value < 0 || (value >= 0xD800 && value <= 0xDFFF)) { - return false; - } else if (value <= PySystemState.maxunicode) { + if (value >= 0 && value <= PySystemState.maxunicode) { partialDecode.appendCodePoint(value); return true; } diff -r aa079dc20555 src/org/python/core/PyUnicode.java --- a/src/org/python/core/PyUnicode.java Sat May 11 19:28:25 2013 +0100 +++ b/src/org/python/core/PyUnicode.java Sun May 12 22:50:42 2013 +0300 @@ -303,10 +303,14 @@ int k = 0; while (i > 0) { int W1 = getString().charAt(k); + k++; if (W1 >= 0xD800 && W1 < 0xDC00) { - k += 2; - } else { - k += 1; + if (k < getString().length()) { + int W2 = getString().charAt(k); + if (W2 >= 0xDC00 && W2 < 0xE000) { + k++; + } + } } i--; } @@ -350,13 +354,14 @@ private int nextCodePoint() { int U; int W1 = getString().charAt(k); - if (W1 >= 0xD800 && W1 < 0xDC00) { - int W2 = getString().charAt(k + 1); - U = (((W1 & 0x3FF) << 10) | (W2 & 0x3FF)) + 0x10000; - k += 2; - } else { - U = W1; - k += 1; + k++; + U = W1; + if (W1 >= 0xD800 && W1 < 0xDC00 && k < getString().length()) { + int W2 = getString().charAt(k); + if (W2 >= 0xDC00 && W2 < 0xE000) { + k++; + U = (((W1 & 0x3FF) << 10) | (W2 & 0x3FF)) + 0x10000; + } } return U; } diff -r aa079dc20555 src/org/python/core/codecs.java --- a/src/org/python/core/codecs.java Sat May 11 19:28:25 2013 +0100 +++ b/src/org/python/core/codecs.java Sun May 12 22:50:42 2013 +0300 @@ -1782,15 +1782,15 @@ private int nextCodePoint() { int U; - // System.out.println("k=" + k); int W1 = s.charAt(k); - if (W1 >= 0xD800 && W1 < 0xDC00) { - int W2 = s.charAt(k + 1); - U = (((W1 & 0x3FF) << 10) | (W2 & 0x3FF)) + 0x10000; - k += 2; - } else { - U = W1; - k += 1; + k++; + U = W1; + if (W1 >= 0xD800 && W1 < 0xDC00 && k < s.length()) { + int W2 = s.charAt(k); + if (W2 >= 0xDC00 && W2 < 0xE000) { + k++; + U = (((W1 & 0x3FF) << 10) | (W2 & 0x3FF)) + 0x10000; + } } return U; }