Index: src/org/python/core/codecs.java =================================================================== --- src/org/python/core/codecs.java (revision 2833) +++ src/org/python/core/codecs.java (working copy) @@ -8,16 +8,29 @@ package org.python.core; + + /** * Contains the implementation of the builtin codecs. * @since Jython 2.0 */ public class codecs { + + + public static final String BACKSLASHREPLACE = "backslashreplace"; + + public static final String IGNORE = "ignore"; + + public static final String REPLACE = "replace"; + + public static final String XMLCHARREFREPLACE = "xmlcharrefreplace"; + private static char Py_UNICODE_REPLACEMENT_CHARACTER = 0xFFFD; - private static PyList searchPath = new PyList(); - private static PyStringMap searchCache = new PyStringMap(); + private static PyList searchPath; + private static PyStringMap searchCache; + private static PyStringMap errorHandlers; private static String default_encoding = "ascii"; @@ -29,8 +42,30 @@ lookup(encoding); default_encoding = encoding; } + + public static PyObject lookup_error(String handlerName){ + registry_init(); + if(handlerName == null){ + handlerName = "strict"; + } + PyObject handler = (PyObject)errorHandlers.__finditem__(handlerName.intern()); + if(handler == null){ + throw new PyException(Py.LookupError, + "unknown error handler name '" + handlerName + "'"); + } + return handler; + } + + public static void register_error(String name, PyObject error){ + registry_init(); + if (!error.isCallable()) { + throw Py.TypeError("argument must be callable"); + } + errorHandlers.__setitem__(name.intern(), error); + } public static void register(PyObject search_function) { + registry_init(); if (!search_function.isCallable()) { throw Py.TypeError("argument must be callable"); } @@ -39,7 +74,7 @@ public static PyTuple lookup(String encoding) { - import_encodings(); + registry_init(); PyString v = new PyString(normalizestring(encoding)); PyObject result = searchCache.__finditem__(v); if (result != 
null) { @@ -108,15 +143,7 @@ errors = errors.intern(); } - /* Shortcuts for common default encodings */ -/* - if (encoding.equals("utf-8")) - return utf_8_decode(v, errors).__getitem__(0).__str__(); - else if (encoding.equals("latin-1")) - ; //return PyUnicode_DecodeLatin1(s, size, errors); - else if (encoding.equals("ascii")) - ; //return PyUnicode_DecodeASCII(s, size, errors); -*/ + /* Shortcut for ascii encoding */ if (encoding.equals("ascii")) { return PyUnicode_DecodeASCII(v.toString(), v.__len__(), errors); @@ -159,16 +186,12 @@ errors = errors.intern(); } - /* Shortcuts for common default encodings */ -/* - if (encoding.equals("utf-8")) - return PyUnicode_DecodeUTF8(v.toString(), v.__len__(), errors); - else if (encoding.equals("latin-1")) - return PyUnicode_DecodeLatin1(v.toString(), v.__len__(), errors); - else -*/ + /* Shortcuts for common default encodings. latin-1 must not use the + * lookup registry for the encodings module to work correctly */ + if (encoding.equals("latin-1")){ + return PyUnicode_EncodeLatin1(v.toString(), v.__len__(), errors); - if (encoding.equals("ascii")) { + }else if (encoding.equals("ascii")) { return PyUnicode_EncodeASCII(v.toString(), v.__len__(), errors); } @@ -193,8 +216,465 @@ PyObject codecs = lookup(encoding); return codecs.__getitem__(0); } + + public static PyObject strict_errors(PyObject[] args, String[] kws){ + ArgParser ap = new ArgParser("strict_errors", args, kws, "exc"); + PyObject exc = ap.getPyObject(0); + if(Py.isInstance(exc, Py.UnicodeDecodeError)){ + throw new PyException(Py.UnicodeDecodeError, exc); + }else if(Py.isInstance(exc, Py.UnicodeEncodeError)){ + throw new PyException(Py.UnicodeEncodeError, exc); + }else if(Py.isInstance(exc, Py.UnicodeTranslateError)){ + throw new PyException(Py.UnicodeTranslateError, exc); + } + throw wrong_exception_type(exc); + } + + public static PyObject ignore_errors(PyObject[] args, String[] kws){ + ArgParser ap = new ArgParser("ignore_errors", args, kws, "exc"); + PyObject 
exc = ap.getPyObject(0); + if(!isUnicodeError(exc)){ + throw wrong_exception_type(exc); + } + PyObject end = exc.__getattr__("end"); + return new PyTuple(new PyObject[]{Py.java2py(""), end}); + } + private static boolean isUnicodeError(PyObject exc) { + return Py.isInstance(exc, Py.UnicodeDecodeError) || + Py.isInstance(exc, Py.UnicodeEncodeError) || + Py.isInstance(exc, Py.UnicodeTranslateError); + } + + public static PyObject replace_errors(PyObject[] args, String[] kws){ + ArgParser ap = new ArgParser("replace_errors", args, kws, "exc"); + PyObject exc = ap.getPyObject(0); + if(Py.isInstance(exc, Py.UnicodeDecodeError)){ + PyObject end = exc.__getattr__("end"); + return new PyTuple(new PyObject[]{new PyUnicode(Py_UNICODE_REPLACEMENT_CHARACTER), end}); + }else if(Py.isInstance(exc, Py.UnicodeEncodeError)){ + PyObject end = exc.__getattr__("end"); + return new PyTuple(new PyObject[]{Py.java2py("?"), end}); + }else if(Py.isInstance(exc, Py.UnicodeTranslateError)){ + PyObject end = exc.__getattr__("end"); + return new PyTuple(new PyObject[]{new PyUnicode(Py_UNICODE_REPLACEMENT_CHARACTER), end}); + } + throw wrong_exception_type(exc); + } + + public static PyObject xmlcharrefreplace_errors(PyObject[] args, String[] kws){ + ArgParser ap = new ArgParser("xmlcharrefreplace_errors", args, kws, "exc"); + PyObject exc = ap.getPyObject(0); + if(!Py.isInstance(exc, Py.UnicodeEncodeError)){ + throw wrong_exception_type(exc); + } + int start = ((PyInteger)exc.__getattr__("start")).getValue(); + int end = ((PyInteger)exc.__getattr__("end")).getValue(); + String object = exc.__getattr__("object").toString(); + StringBuffer replacement = new StringBuffer(); + xmlcharrefreplace_internal(start, end, object, replacement); + return new PyTuple(new PyObject[]{Py.java2py(replacement.toString()), exc.__getattr__("end")}); + } + + public static StringBuffer xmlcharrefreplace(int start, int end, String toReplace){ + StringBuffer replacement = new StringBuffer(); + 
xmlcharrefreplace_internal(start, end, toReplace, replacement); + return replacement; + } + private static void xmlcharrefreplace_internal(int start, int end, String object, StringBuffer replacement) { + for(int i = start; i < end; i++) { + replacement.append("&#"); + char cur = object.charAt(i); + int digits; + int base; + if(cur < 10) { + digits = 1; + base = 1; + } else if(cur < 100) { + digits = 2; + base = 10; + } else if(cur < 1000) { + digits = 3; + base = 100; + } else if(cur < 10000) { + digits = 4; + base = 1000; + } else if(cur < 100000) { + digits = 5; + base = 10000; + } else if(cur < 1000000) { + digits = 6; + base = 100000; + } else { + digits = 7; + base = 1000000; + } + while(digits-- > 0) { + replacement.append((char)('0' + cur / base)); + cur %= base; + base /= 10; + } + replacement.append(';'); + } + } + + private static PyException wrong_exception_type(PyObject exc) { + PyObject excClass = exc.__getattr__("__class__"); + PyObject className = excClass.__getattr__("__name__"); + return new PyException(Py.TypeError, "Don't know how to handle " + + className + " in error callback"); + } + + + static char hexdigits[] = { + '0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' + }; + + public static PyObject backslashreplace_errors(PyObject[] args, String[] kws){ + ArgParser ap = new ArgParser("backslashreplace_errors", args, kws, "exc"); + PyObject exc = ap.getPyObject(0); + if(!Py.isInstance(exc, Py.UnicodeEncodeError)){ + throw wrong_exception_type(exc); + } + int start = ((PyInteger)exc.__getattr__("start")).getValue(); + int end = ((PyInteger)exc.__getattr__("end")).getValue(); + String object = exc.__getattr__("object").toString(); + StringBuffer replacement = new StringBuffer(); + backslashreplace_internal(start, end, object, replacement); + return new PyTuple(new PyObject[]{Py.java2py(replacement.toString()), exc.__getattr__("end")}); + } + + public static StringBuffer backslashreplace(int start, int end, String 
toReplace){ + StringBuffer replacement = new StringBuffer(); + backslashreplace_internal(start, end, toReplace, replacement); + return replacement; + } + + private static void backslashreplace_internal(int start, int end, String object, StringBuffer replacement) { + for(int i = start; i < end; i++) { + replacement.append('\\'); + char c = object.charAt(i); + if(c >= 0x00010000) { + replacement.append('U'); + replacement.append(hexdigits[(c >> 28) & 0xf]); + replacement.append(hexdigits[(c >> 24) & 0xf]); + replacement.append(hexdigits[(c >> 20) & 0xf]); + replacement.append(hexdigits[(c >> 16) & 0xf]); + replacement.append(hexdigits[(c >> 12) & 0xf]); + replacement.append(hexdigits[(c >> 8) & 0xf]); + } else if(c >= 0x100) { + replacement.append('u'); + replacement.append(hexdigits[(c >> 12) & 0xf]); + replacement.append(hexdigits[(c >> 8) & 0xf]); + } else + replacement.append('x'); + replacement.append(hexdigits[(c >> 4) & 0xf]); + replacement.append(hexdigits[c & 0xf]); + } + } + + private static void registry_init(){ + if(searchPath != null) + return; + searchPath = new PyList(); + searchCache = new PyStringMap(); + errorHandlers = new PyStringMap(); + String[] builtinErrorHandlers = new String[] {"strict", + IGNORE, + REPLACE, + XMLCHARREFREPLACE, + BACKSLASHREPLACE}; + for(int i = 0; i < builtinErrorHandlers.length; i++) { + register_error(builtinErrorHandlers[i], + Py.newJavaFunc(codecs.class, builtinErrorHandlers[i] + + "_errors")); + } + import_encodings(); + } + /* --- UTF-7 Codec -------------------------------------------------------- */ + + /* see RFC2152 for details */ + + + public static + char utf7_special[] = { + /* + * indicate whether a UTF-7 character is special i.e. 
cannot be directly + * encoded: 0 - not special 1 - special 2 - whitespace (optional) 3 - + * RFC2152 Set O (optional) + */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0, + 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3, + 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1, + + }; + + private static boolean SPECIAL(char c, boolean encodeO, boolean encodeWS){ + return (c>127 || utf7_special[(c)] == 1) || + (encodeWS && (utf7_special[(c)] == 2)) || + (encodeO && (utf7_special[(c)] == 3)); + } + + private static final String B64_CHARS = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + + private static char B64(int n){ + return B64_CHARS.charAt(n & 0x3f); + } + + private static boolean B64CHAR(char c) { + return B64_CHARS.indexOf(c) != -1; + } + + private static int UB64(char c) { + return ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? + (c) - 71 : (c) >= 'A' ? 
(c) - 65 : (c) + 4); + } + + public static String PyUnicode_DecodeUTF7(String str, + String errors) { + int s = 0; + int e = str.length(); + boolean inShift = false; + int bitsInCharsleft = 0; + long charsleft = 0; + boolean surrogate = false; + char highOrderSurrogate = 0; + StringBuffer unicode = new StringBuffer(e); + while(s < e) { + // restart: + char ch = str.charAt(s); + if(inShift) { + if((ch == '-') || !B64CHAR(ch)) { + inShift = false; + s++; + while(bitsInCharsleft >= 16) { + bitsInCharsleft -= 16; + char outCh = (char)((charsleft >> bitsInCharsleft) & 0xffff); + if(surrogate) { + if(0xD800 <= outCh && outCh <= 0xDBFF) { + unicode.append(highOrderSurrogate); + unicode.append(outCh); + } else { + s = codecs.insertReplacementAndGetResume(unicode, + errors, + "utf-16", + str, + s, + s + 1, + "illegal UTF-16 surrogate"); + } + surrogate = false; + } else if(0xDC00 <= outCh && outCh <= 0xDFFF) { + surrogate = true; + highOrderSurrogate = outCh; + } else { + unicode.append(outCh); + } + } + if(bitsInCharsleft >= 6) { + /* + * The shift sequence has a partial character in it. If + * bitsleft < 6 then we could just classify it as + * padding but that is not the case here + */ + s = insertReplacementAndGetResume(unicode, + errors, + "utf-7", + str, + s, + s + 1, + "partial character in shift sequence"); + } + /* + * According to RFC2152 the remaining bits should be zero. + * We choose to signal an error/insert a replacement + * character here to indicate the potential of a misencoded + * character. 
+ */ + if(bitsInCharsleft > 0 && ((charsleft << 5 - bitsInCharsleft) & 0x1f) > 0){ + s = insertReplacementAndGetResume(unicode, + errors, + "utf-7", + str, + s, + s + 1, + "non-zero padding bits in shift sequence"); + } + if(ch == '-') { + if((s < e) && (str.charAt(s) == '-')) { + unicode.append('-'); + inShift = true; + } + } else if(SPECIAL(ch, false, false)) { + s = insertReplacementAndGetResume(unicode, + errors, + "utf-7", + str, + s, + s + 1, + "unexpected special character"); + } else { + unicode.append(ch); + } + } else { + charsleft = (charsleft << 6) | UB64(ch); + bitsInCharsleft += 6; + s++; + while(bitsInCharsleft >= 16) { + bitsInCharsleft -= 16; + char outCh = (char)((charsleft >> bitsInCharsleft) & 0xffff); + if(surrogate) { + if(0xD800 <= outCh && outCh <= 0xDBFF) { + unicode.append(highOrderSurrogate); + unicode.append(outCh); + } else { + s = codecs.insertReplacementAndGetResume(unicode, + errors, + "utf-16", + str, + s, + s + 1, + "illegal UTF-16 surrogate"); + } + surrogate = false; + } else if(0xDC00 <= outCh && outCh <= 0xDFFF) { + surrogate = true; + highOrderSurrogate = outCh; + } else { + unicode.append(outCh); + } + } + } + } else if(ch == '+') { + s++; + if(s < e && str.charAt(s) == '-') { + s++; + unicode.append('+'); + } else { + inShift = true; + bitsInCharsleft = 0; + } + } else if(SPECIAL(ch, false, false)) { + s = insertReplacementAndGetResume(unicode, + errors, + "utf-7", + str, + s, + s + 1, + "unexpected special character"); + } else { + unicode.append(ch); + s++; + } + if(inShift && s == e) { + s = insertReplacementAndGetResume(unicode, + errors, + "utf-7", + str, + s, + s, + "unterminated shift sequence"); + } + } + return unicode.toString(); + } + + + public static String PyUnicode_EncodeUTF7(String str, + boolean encodeSetO, + boolean encodeWhiteSpace, + String errors) + { + int size = str.length(); + + if (size == 0) + return ""; + boolean inShift = false; + int bitsleft = 0; + int charsleft = 0; + + StringBuffer v = new 
StringBuffer(); + + for (int i = 0;i < size; ++i) { + char ch = str.charAt(i); + + if (!inShift) { + if (ch == '+') { + v.append('+'); + v.append('-'); + } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { + charsleft = ch; + bitsleft = 16; + v.append('+'); + while (bitsleft >= 6) { + v.append(B64(charsleft >> (bitsleft-6))); + bitsleft -= 6; + } + inShift = bitsleft > 0; + } else { + v.append((char) ch); + } + } else { + if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) { + v.append(B64(charsleft << (6-bitsleft))); + charsleft = 0; + bitsleft = 0; + /* Characters not in the BASE64 set implicitly unshift the sequence + so no '-' is required, except if the character is itself a '-' */ + if (B64CHAR(ch) || ch == '-') { + v.append('-'); + } + inShift = false; + v.append( ch); + } else { + bitsleft += 16; + charsleft = (charsleft << 16) | ch; + while (bitsleft >= 6) { + v.append(B64(charsleft >> (bitsleft-6))); + bitsleft -= 6; + } + /* If the next character is special then we don't need to terminate + the shift sequence. If the next character is not a BASE64 character + or '-' then the shift sequence will be terminated implicitly and we + don't have to insert a '-'. */ + + if (bitsleft == 0) { + if (i + 1 < size) { + char ch2 = str.charAt(i+1); + + if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) { + + } else if (B64CHAR(ch2) || ch2 == '-') { + v.append('-'); + inShift = false; + } else { + inShift = false; + } + + } + else { + v.append('-'); + inShift = false; + } + } + } + } + } + if (bitsleft > 0) { + v.append(B64(charsleft << (6-bitsleft))); + v.append('-'); + } + return v.toString(); + } + + /* --- UTF-8 Codec ---------------------------------------------------- */ private static byte utf8_code_length[] = { /* Map UTF-8 encoded prefix byte to sequence length. 
zero means @@ -225,53 +705,41 @@ /* Unpack UTF-8 encoded data */ for (int i = 0; i < size; ) { int ch = str.charAt(i); - if (ch > 0xFF) { - codecs.decoding_error("utf-8", unicode, errors, - "ordinal not in range(255)"); - i++; - continue; - } if (ch < 0x80) { unicode.append((char) ch); i++; continue; } + if (ch > 0xFF) { + i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + 1, "ordinal not in range(255)"); + continue; + } int n = utf8_code_length[ch]; if (i + n > size) { - codecs.decoding_error("utf-8", unicode, errors, - "unexpected end of data"); - i++; + i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + 1, "unexpected end of data"); continue; } switch (n) { case 0: - codecs.decoding_error("utf-8", unicode, errors, - "unexpected code byte"); - i++; + i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + 1, "unexpected code byte"); continue; case 1: - codecs.decoding_error("utf-8", unicode, errors, - "internal error"); - i++; + i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + 1, "internal error"); continue; case 2: char ch1 = str.charAt(i+1); if ((ch1 & 0xc0) != 0x80) { - codecs.decoding_error("utf-8", unicode, errors, - "invalid data"); - i++; + i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + 2, "invalid data"); continue; } ch = ((ch & 0x1f) << 6) + (ch1 & 0x3f); if (ch < 0x80) { - codecs.decoding_error("utf-8", unicode, errors, - "illegal encoding"); - i++; + i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + 2, "illegal encoding"); continue; } else unicode.append((char) ch); @@ -281,16 +749,12 @@ ch1 = str.charAt(i+1); char ch2 = str.charAt(i+2); if ((ch1 & 0xc0) != 0x80 || (ch2 & 0xc0) != 0x80) { - codecs.decoding_error("utf-8", unicode, errors, - "invalid data"); - i++; + i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + 3, "invalid data"); continue; } ch = ((ch & 0x0f) << 12) + ((ch1 & 0x3f) << 6) + (ch2 & 
0x3f); if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) { - codecs.decoding_error("utf-8", unicode, errors, - "illegal encoding"); - i++; + i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + 3, "illegal encoding"); continue; } else unicode.append((char) ch); @@ -303,9 +767,7 @@ if ((ch1 & 0xc0) != 0x80 || (ch2 & 0xc0) != 0x80 || (ch3 & 0xc0) != 0x80) { - codecs.decoding_error("utf-8", unicode, errors, - "invalid data"); - i++; + i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + 4, "invalid data"); continue; } ch = ((ch & 0x7) << 18) + ((ch1 & 0x3f) << 12) + @@ -315,9 +777,7 @@ byte encoding */ (ch > 0x10ffff)) { /* maximum value allowed for UTF-16 */ - codecs.decoding_error("utf-8", unicode, errors, - "illegal encoding"); - i++; + i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + 4, "illegal encoding"); continue; } /* compute and append the two surrogates: */ @@ -334,9 +794,8 @@ default: /* Other sizes are only needed for UCS-4 */ - codecs.decoding_error("utf-8", unicode, errors, - "unsupported Unicode code range"); - i++; + i = insertReplacementAndGetResume(unicode, errors, "utf-8", str, i, i + n, "unsupported Unicode code range"); + continue; } i += n; } @@ -380,22 +839,33 @@ } - - /* --- 7-bit ASCII Codec -------------------------------------------- */ - public static String PyUnicode_DecodeASCII(String str, int size, String errors) { + return PyUnicode_DecodeIntLimited(str, size, errors, "ascii", 128); + } + + public static String PyUnicode_DecodeLatin1(String str, int size, + String errors) + { + return PyUnicode_DecodeIntLimited(str, size, errors, "latin-1", 256); + } + + private static String PyUnicode_DecodeIntLimited(String str, int size,String errors, String encoding, int limit){ StringBuffer v = new StringBuffer(size); + String reason = "ordinal not in range(" + limit + ")"; for (int i = 0; i < size; i++) { char ch = str.charAt(i); - if (ch < 128) { + if (ch < limit) { v.append(ch); } else 
{ - decoding_error("ascii", v, errors, - "ordinal not in range(128)"); - continue; + i = insertReplacementAndGetResume(v,errors, + encoding, + str, + i, + i + 1, + reason) - 1; } } @@ -406,19 +876,84 @@ public static String PyUnicode_EncodeASCII(String str, int size, String errors) { + return PyUnicode_EncodeIntLimited(str, size, errors, "ascii", 128); + } + + + public static String PyUnicode_EncodeLatin1(String str, int size, + String errors) + { + + return PyUnicode_EncodeIntLimited(str, size, errors, "latin-1", 256); + } + + + private static String PyUnicode_EncodeIntLimited(String str, int size, + String errors, String encoding, int limit) + { + String reason = "ordinal not in range(" + limit + ")"; StringBuffer v = new StringBuffer(size); - - for (int i = 0; i < size; i++) { + for(int i = 0; i < size; i++) { char ch = str.charAt(i); - if (ch >= 128) { - encoding_error("ascii", v, errors, - "ordinal not in range(128)"); + if(ch >= limit) { + int nextGood = i + 1; + for(; nextGood < size; nextGood++) { + if(str.charAt(nextGood) < limit) { + break; + } + } + if(errors != null) { + if(errors.equals(IGNORE)) { + i = nextGood - 1; + continue; + } else if(errors.equals(REPLACE)) { + for(int j = i; j < nextGood; j++) { + v.append('?'); + } + i = nextGood - 1; + continue; + } else if(errors.equals(XMLCHARREFREPLACE)) { + v.append(xmlcharrefreplace(i, nextGood, str)); + i = nextGood - 1; + continue; + } else if(errors.equals(BACKSLASHREPLACE)) { + v.append(backslashreplace(i, nextGood, str)); + i = nextGood - 1; + continue; + } + } + PyObject replacement = encoding_error(errors, + encoding, + str, + i, + nextGood, + reason); + String replStr = replacement.__getitem__(0).toString(); + for(int j = 0; j < replStr.length(); j++) { + if(replStr.charAt(j) >= limit) { + throw Py.UnicodeEncodeError(encoding, str, i + j, i + j + + 1, reason); + } + } + v.append(replStr); + i = calcNewPosition(size, replacement) - 1; } else { v.append(ch); } } return v.toString(); } + + public 
static int calcNewPosition(int size, PyObject errorTuple) { + int newPosition = ((PyInteger)errorTuple.__getitem__(1)).getValue(); + if(newPosition < 0){ + newPosition = size + newPosition; + } + if(newPosition > size || newPosition < 0){ + throw Py.IndexError(newPosition + " out of bounds of encoded string"); + } + return newPosition; + } @@ -457,85 +992,128 @@ { int size = str.length(); StringBuffer v = new StringBuffer(size); - - for (int i = 0; i < size; ) { + for(int i = 0; i < size;) { char ch = str.charAt(i); - /* Non-escape characters are interpreted as Unicode ordinals */ - if (ch != '\\') { + if(ch != '\\') { v.append(ch); i++; continue; } - - /* \\u-escapes are only interpreted iff the number of leading - backslashes is odd */ + /* + * \\u-escapes are only interpreted iff the number of leading + * backslashes is odd + */ int bs = i; - while (i < size) { + while(i < size) { ch = str.charAt(i); - if (ch != '\\') + if(ch != '\\') break; v.append(ch); i++; } - if (((i - bs) & 1) == 0 || i >= size || ch != 'u') { + if(((i - bs) & 1) == 0 || i >= size || ch != 'u') { continue; } v.setLength(v.length() - 1); i++; - /* \\uXXXX with 4 hex digits */ - int x = 0; - for (int j = 0; j < 4; j++) { - ch = str.charAt(i+j); - int d = Character.digit(ch, 16); - if (d == -1) { - codecs.decoding_error("unicode escape", v, errors, - "truncated \\uXXXX"); + int x = 0, d = 0, j = 0; + for(; j < 4; j++) { + ch = str.charAt(i + j); + d = Character.digit(ch, 16); + if(d == -1) { break; } - x = ((x<<4) & ~0xF) + d; + x = ((x << 4) & ~0xF) + d; } - i += 4; - v.append((char) x); - } - return v.toString(); + if(d == -1) { + i = codecs.insertReplacementAndGetResume(v, + errors, + "unicodeescape", + str, + bs, + i + j, + "truncated \\uXXXX"); + } else { + i += 4; + v.append((char)x); + } + } + return v.toString(); } - /* --- Utility methods -------------------------------------------- */ + public static PyObject encoding_error(String errors, + String encoding, + String toEncode, + int 
start, + int end, + String reason) { + PyObject errorHandler = lookup_error(errors); + PyException exc = Py.UnicodeEncodeError(encoding, + toEncode, + start, + end, + reason); + exc.instantiate(); + PyObject replacement = errorHandler.__call__(new PyObject[] {exc.value}); + checkErrorHandlerReturn(errors, replacement); + return replacement; + } - public static void encoding_error(String type, StringBuffer dest, - String errors, String details) - { - if (errors == null || errors == "strict") { - throw Py.UnicodeError(type + " encoding error: " + details); - } else if (errors == "ignore") { - //ignore - } else if (errors == "replace") { - dest.append('?'); - } else { - throw Py.ValueError(type + " encoding error; "+ - "unknown error handling code: " + errors); + public static int insertReplacementAndGetResume(StringBuffer partialDecode, + String errors, + String encoding, + String toDecode, + int start, + int end, + String reason) { + if(errors != null) { + if(errors.equals(IGNORE)) { + return end; + } else if(errors.equals(REPLACE)) { + while(start < end) { + partialDecode.append(Py_UNICODE_REPLACEMENT_CHARACTER); + start++; + } + return end; + } } + PyObject replacement = decoding_error(errors, + encoding, + toDecode, + start, + end, + reason); + checkErrorHandlerReturn(errors, replacement); + partialDecode.append(replacement.__getitem__(0).toString()); + return calcNewPosition(toDecode.length(), replacement); } + public static PyObject decoding_error(String errors, + String encoding, + String toEncode, + int start, + int end, + String reason) { + PyObject errorHandler = lookup_error(errors); + PyException exc = Py.UnicodeDecodeError(encoding, + toEncode, + start, + end, + reason); + exc.instantiate(); + return errorHandler.__call__(new PyObject[] {exc.value}); + } - public static void decoding_error(String type, StringBuffer dest, - String errors, String details) - { - if (errors == null || errors == "strict") { - throw Py.UnicodeError(type + " decoding error: " + 
details); + private static void checkErrorHandlerReturn(String errors, + PyObject replacement) { + if(!(replacement instanceof PyTuple) || replacement.__len__() != 2 + || !(replacement.__getitem__(0) instanceof PyBaseString) + || !(replacement.__getitem__(1) instanceof PyInteger)) { + throw new PyException(Py.TypeError, "error_handler " + errors + + " must return a tuple of (replacement, new position)"); } - else if (errors == "ignore") { - //ignore - } else if (errors == "replace") { - if (dest != null) { - dest.append(Py_UNICODE_REPLACEMENT_CHARACTER); - } - } else { - throw Py.ValueError(type + " decoding error; "+ - "unknown error handling code: " + errors); - } } } Index: src/org/python/core/PyString.java =================================================================== --- src/org/python/core/PyString.java (revision 2833) +++ src/org/python/core/PyString.java (working copy) @@ -1614,7 +1614,7 @@ public PyObject __call__(PyObject arg0) { String result=self.str_join(arg0); //XXX: do we really need to check self? - if (self instanceof PyUnicode||arg0 instanceof PyUnicode) { + if (self instanceof PyUnicode||(arg0.__len__() > 0 && arg0.__getitem__(0) instanceof PyUnicode)) { return new PyUnicode(result); } else { return new PyString(result); @@ -1625,7 +1625,8 @@ PyString self=(PyString)gself; String result=self.str_join(arg0); //XXX: do we really need to check self? 
- if (self instanceof PyUnicode||arg0 instanceof PyUnicode) { + System.out.println("INST CALL"); + if (self instanceof PyUnicode||(arg0.__len__() > 0 && arg0.__getitem__(0) instanceof PyUnicode)) { return new PyUnicode(result); } else { return new PyString(result); @@ -2926,7 +2927,6 @@ StringBuffer v = new StringBuffer(str.length()); char quote = 0; - boolean unicode = false; if (use_quotes) { quote = str.indexOf('\'') >= 0 && @@ -2940,13 +2940,32 @@ if (use_quotes && (ch == quote || ch == '\\')) { v.append('\\'); v.append((char) ch); + continue; } + /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */ + else if (ch >= 0xD800 && ch < 0xDC00) { + char ch2 = str.charAt(i++); + size--; + if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) { + int ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000; + v.append('\\'); + v.append('U'); + v.append(hexdigit[(ucs >> 28) & 0xf]); + v.append(hexdigit[(ucs >> 24) & 0xf]); + v.append(hexdigit[(ucs >> 20) & 0xf]); + v.append(hexdigit[(ucs >> 16) & 0xf]); + v.append(hexdigit[(ucs >> 12) & 0xf]); + v.append(hexdigit[(ucs >> 8) & 0xf]); + v.append(hexdigit[(ucs >> 4) & 0xf]); + v.append(hexdigit[ucs & 0xf]); + continue; + } + /* Fall through: isolated surrogates are copied as-is */ + i--; + size++; + } /* Map 16-bit characters to '\\uxxxx' */ - else if (ch >= 256) { - if (use_quotes && !unicode) { - v.insert(0, 'u'); - unicode = true; - } + if (ch >= 256) { v.append('\\'); v.append('u'); v.append(hexdigit[(ch >> 12) & 0xf]); @@ -2961,9 +2980,10 @@ else if (use_quotes && ch == '\f') v.append("\\f"); else if (use_quotes && ch == '\r') v.append("\\r"); else if (ch < ' ' || ch >= 127) { - v.append("\\x"); - v.append(hexdigit[(ch >> 4) & 0xF]); - v.append(hexdigit[ch & 0xF]); + v.append('\\'); + v.append('x'); + v.append(hexdigit[(ch >> 4) & 0xf]); + v.append(hexdigit[ch & 0xf]); } /* Copy everything else as-is */ else @@ -2976,177 +2996,265 @@ private static ucnhashAPI pucnHash = null; - public static String 
decode_UnicodeEscape(String str, int start, int end, - String errors, boolean unicode) - { - StringBuffer v = new StringBuffer(end-start); - for (int s = start; s < end; ) { + + public static String decode_UnicodeEscape(String str, + int start, + int end, + String errors, + boolean unicode) { + StringBuffer v = new StringBuffer(end - start); + for(int s = start; s < end;) { char ch = str.charAt(s); - /* Non-escape characters are interpreted as Unicode ordinals */ - if (ch != '\\') { + if(ch != '\\') { v.append(ch); s++; continue; } - + int loopStart = s; /* \ - Escapes */ s++; + if(s == end) { + s = codecs.insertReplacementAndGetResume(v, + errors, + "unicodeescape", + str, + loopStart, + s + 1, + "\\ at end of string"); + continue; + } ch = str.charAt(s++); - switch (ch) { - - /* \x escapes */ - case '\n': break; - case '\\': v.append('\\'); break; - case '\'': v.append('\''); break; - case '\"': v.append('\"'); break; - case 'b': v.append('\b'); break; - case 'f': v.append('\014'); break; /* FF */ - case 't': v.append('\t'); break; - case 'n': v.append('\n'); break; - case 'r': v.append('\r'); break; - case 'v': v.append('\013'); break; /* VT */ - case 'a': v.append('\007'); break; /* BEL, not classic C */ - - /* \OOO (octal) escapes */ - case '0': case '1': case '2': case '3': - case '4': case '5': case '6': case '7': - - int x = Character.digit(ch, 8); - for (int j = 0; j < 2 && s < end; j++, s++) { - ch = str.charAt(s); - if (ch < '0' || ch > '7') + switch(ch){ + /* \x escapes */ + case '\n': + break; + case '\\': + v.append('\\'); + break; + case '\'': + v.append('\''); + break; + case '\"': + v.append('\"'); + break; + case 'b': + v.append('\b'); + break; + case 'f': + v.append('\014'); + break; /* FF */ + case 't': + v.append('\t'); + break; + case 'n': + v.append('\n'); + break; + case 'r': + v.append('\r'); + break; + case 'v': + v.append('\013'); + break; /* VT */ + case 'a': + v.append('\007'); + break; /* BEL, not classic C */ + /* \OOO (octal) escapes 
*/ + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + int x = Character.digit(ch, 8); + for(int j = 0; j < 2 && s < end; j++, s++) { + ch = str.charAt(s); + if(ch < '0' || ch > '7') + break; + x = (x << 3) + Character.digit(ch, 8); + } + v.append((char)x); + break; + case 'x': + s = hexescape(v, errors, 2, s, str, end, "truncated \\xXX"); + break; + case 'u': + if(!unicode) { + v.append('\\'); + v.append('u'); break; - x = (x<<3) + Character.digit(ch, 8); - } - v.append((char) x); - break; - - case 'x': - int i; - for (x = 0, i = 0; i < 2 && s < end; i++) { - ch = str.charAt(s + i); - int d = Character.digit(ch, 16); - if (d == -1) { - codecs.decoding_error("unicode escape", v, errors, - "truncated \\xXX"); - i++; + } + s = hexescape(v, + errors, + 4, + s, + str, + end, + "truncated \\uXXXX"); + break; + case 'U': + if(!unicode) { + v.append('\\'); + v.append('U'); break; } - - x = ((x<<4) & ~0xF) + d; - } - s += i; - v.append((char) x); - break; - - /* \ uXXXX with 4 hex digits */ - case 'u': - if (!unicode) { - v.append('\\'); - v.append('u'); + s = hexescape(v, + errors, + 8, + s, + str, + end, + "truncated \\UXXXXXXXX"); break; - } - if (s+4 > end) { - codecs.decoding_error("unicode escape", v, errors, - "truncated \\uXXXX"); - break; - } - for (x = 0, i = 0; i < 4; i++) { - ch = str.charAt(s + i); - int d = Character.digit(ch, 16); - if (d == -1) { - codecs.decoding_error("unicode escape", v, errors, - "truncated \\uXXXX"); + case 'N': + if(!unicode) { + v.append('\\'); + v.append('N'); break; } - x = ((x<<4) & ~0xF) + d; - } - s += i; - v.append((char) x); - break; - - case 'N': - if (!unicode) { - v.append('\\'); - v.append('N'); - break; - } - /* Ok, we need to deal with Unicode Character Names now, - * make sure we've imported the hash table data... 
- */ - if (pucnHash == null) { - PyObject mod = imp.importName("ucnhash", true); - mod = mod.__call__(); - pucnHash = (ucnhashAPI) mod.__tojava__(Object.class); - if (pucnHash.getCchMax() < 0) - codecs.decoding_error("unicode escape", v, errors, - "Unicode names not loaded"); - } - - if (str.charAt(s) == '{') { - int startName = s + 1; - int endBrace = startName; - - /* look for either the closing brace, or we - * exceed the maximum length of the unicode - * character names + /* + * Ok, we need to deal with Unicode Character Names now, + * make sure we've imported the hash table data... */ - int maxLen = pucnHash.getCchMax(); - while (endBrace < end && str.charAt(endBrace) != '}' - && (endBrace - startName) <= maxLen) { - endBrace++; + if(pucnHash == null) { + PyObject mod = imp.importName("ucnhash", true); + mod = mod.__call__(); + pucnHash = (ucnhashAPI)mod.__tojava__(Object.class); + if(pucnHash.getCchMax() < 0) + throw Py.UnicodeError("Unicode names not loaded"); } - if (endBrace != end && str.charAt(endBrace) == '}') { - int value = pucnHash.getValue(str, startName, - endBrace); - if (value < 0) { - codecs.decoding_error("unicode escape", v, - errors, "Invalid Unicode Character Name"); - v.append('\\'); - v.append(str.charAt(s-1)); - break; - } - - if (value < 1<<16) { - /* In UCS-2 range, easy solution.. 
*/ - v.append((char) value); - } else { - /* Oops, its in UCS-4 space, */ - /* compute and append the two surrogates: */ - /* translate from 10000..10FFFF to 0..FFFFF */ - value -= 0x10000; - - /* high surrogate = top 10 bits added to D800 */ - v.append((char) (0xD800 + (value >> 10))); - - /* low surrogate = bottom 10 bits added to DC00*/ - v.append((char) (0xDC00 + (value & ~0xFC00))); + if(str.charAt(s) == '{') { + int startName = s + 1; + int endBrace = startName; + /* + * look for either the closing brace, or we exceed the + * maximum length of the unicode character names + */ + int maxLen = pucnHash.getCchMax(); + while(endBrace < end && str.charAt(endBrace) != '}' + && (endBrace - startName) <= maxLen) { + endBrace++; } - s = endBrace + 1; + if(endBrace != end && str.charAt(endBrace) == '}') { + int value = pucnHash.getValue(str, + startName, + endBrace); + if(storeUnicodeCharacter(value, v)) { + s = endBrace + 1; + } else { + s = codecs.insertReplacementAndGetResume(v, + errors, + "unicodeescape", + str, + loopStart, + endBrace + 1, + "illegal Unicode character"); + } + } else { + s = codecs.insertReplacementAndGetResume(v, + errors, + "unicodeescape", + str, + loopStart, + endBrace, + "malformed \\N character escape"); + } + break; } else { - codecs.decoding_error("unicode escape", v, errors, - "Unicode name missing closing brace"); - v.append('\\'); - v.append(str.charAt(s-1)); - break; + s = codecs.insertReplacementAndGetResume(v, + errors, + "unicodeescape", + str, + loopStart, + s + 1, + "malformed \\N character escape"); } break; - } - codecs.decoding_error("unicode escape", v, errors, - "Missing opening brace for Unicode " + - "Character Name escape"); + default: + v.append('\\'); + v.append(str.charAt(s - 1)); + break; + } + } + return v.toString(); + } - /* fall through on purpose */ - default: - v.append('\\'); - v.append(str.charAt(s-1)); - break; - } - } - return v.toString(); + private static int hexescape(StringBuffer partialDecode, + String 
errors, + int digits, + int hexDigitStart, + String str, + int size, + String errorMessage) { + if(hexDigitStart + digits > size) { + return codecs.insertReplacementAndGetResume(partialDecode, + errors, + "unicodeescape", + str, + hexDigitStart - 2, + size, + errorMessage); + } + int i = 0; + int x = 0; + for(; i < digits; ++i) { + char c = str.charAt(hexDigitStart + i); + int d = Character.digit(c, 16); + if(d == -1) { + return codecs.insertReplacementAndGetResume(partialDecode, + errors, + "unicodeescape", + str, + hexDigitStart - 2, + hexDigitStart + i + 1, + errorMessage); + } + x = (x << 4) & ~0xF; + if(c >= '0' && c <= '9') + x += c - '0'; + else if(c >= 'a' && c <= 'f') + x += 10 + c - 'a'; + else + x += 10 + c - 'A'; + } + if(storeUnicodeCharacter(x, partialDecode)) { + return hexDigitStart + i; + } else { + return codecs.insertReplacementAndGetResume(partialDecode, + errors, + "unicodeescape", + str, + hexDigitStart - 2, + hexDigitStart + i + 1, + "illegal Unicode character"); + } } + /*pass in an int since this can be a UCS-4 character */ + private static boolean storeUnicodeCharacter(int value, + StringBuffer partialDecode) { + if(value < 0) { + return false; + } else if(value < 1 << 16) { + /* In UCS-2 range, easy solution.. */ + partialDecode.append((char)value); + return true; + } else if(value <= 0x10ffff) { + /* Oops, its in UCS-4 space, */ + /* compute and append the two surrogates: */ + /* translate from 10000..10FFFF to 0..FFFFF */ + value -= 0x10000; + /* high surrogate = top 10 bits added to D800 */ + partialDecode.append((char)(0xD800 + (value >> 10))); + /* low surrogate = bottom 10 bits added to DC00 */ + partialDecode.append((char)(0xDC00 + (value & ~0xFC00))); + return true; + } + return false; + } + public boolean equals(Object other) { if (!(other instanceof PyString)) return false; @@ -3306,7 +3414,11 @@ for (int i=0; i 250){ + // A magic number. Larger than in CPython. 
+ throw Py.OverflowError( + "formatted long is too long (precision too long?)"); + } String s = arg.toString(); int end = s.length(); int ptr = 0; @@ -4928,6 +5045,11 @@ } public String formatInteger(long v, int radix, boolean unsigned) { + if(precision > 250){ + // A magic number. Larger than in CPython. + throw Py.OverflowError( + "formatted integer is too long (precision too long?)"); + } if (unsigned) { if (v < 0) v = 0x100000000l + v; @@ -4949,6 +5071,11 @@ } public String formatFloatDecimal(double v, boolean truncate) { + if(precision > 250) { + // A magic number. Larger than in CPython. + throw Py.OverflowError( + "formatted float is too long (precision too long?)"); + } java.text.NumberFormat format = java.text.NumberFormat.getInstance( java.util.Locale.US); int prec = precision; @@ -5087,11 +5214,6 @@ precision = getNumber(); if (precision < -1) precision = 0; - if (precision > 250) { - // A magic number. Larger than in CPython. - throw Py.OverflowError( - "formatted float is too long (precision too long?)"); - } c = pop(); } @@ -5111,7 +5233,6 @@ fill = '0'; else fill = ' '; - switch(c) { case 's': case 'r': Index: src/org/python/core/exceptions.java =================================================================== --- src/org/python/core/exceptions.java (revision 2833) +++ src/org/python/core/exceptions.java (working copy) @@ -64,6 +64,10 @@ + " | +-- ValueError\n" + " | | |\n" + " | | +-- UnicodeError\n" + + " | | |\n" + + " | | +-- UnicodeEncodeError\n" + + " | | +-- UnicodeDecodeError\n" + + " | | +-- UnicodeTranslateError\n" + " | |\n" + " | +-- ReferenceError\n" + " | +-- SystemError\n" @@ -158,6 +162,15 @@ buildClass(dict, "UnicodeError", "ValueError", "empty__init__", "Unicode related error."); + buildClass(dict, "UnicodeEncodeError", "UnicodeError", "UnicodeEncodeError", + "Unicode encoding error."); + + buildClass(dict, "UnicodeDecodeError", "UnicodeError", "UnicodeDecodeError", + "Unicode decoding error."); + + buildClass(dict, 
"UnicodeTranslateError", "UnicodeError", "UnicodeTranslateError", + "Unicode translation error."); + buildClass(dict, "KeyboardInterrupt", "StandardError", "empty__init__", "Program interrupted by user."); @@ -213,6 +226,9 @@ buildClass(dict, "DeprecationWarning", "Warning", "empty__init__", "Base class for warnings about deprecated features."); + + buildClass(dict, "PendingDeprecationWarning", "Warning", "empty__init__", + "Base class for warnings about features which will be deprecated in the future."); buildClass(dict, "SyntaxWarning", "Warning", "empty__init__", "Base class for warnings about dubious syntax."); @@ -222,6 +238,9 @@ buildClass(dict, "OverflowWarning", "Warning", "empty__init__", "Base class for warnings about numeric overflow."); + + buildClass(dict, "FutureWarning", "Warning", "empty__init__", + "Base class for warnings about constructs that will change semantically in the future."); ts.frame = ts.frame.f_back; } @@ -308,7 +327,7 @@ } public static PyString SyntaxError__str__(PyObject[] arg, String[] kws) { - ArgParser ap = new ArgParser("__init__", arg, kws, "self", "args"); + ArgParser ap = new ArgParser("__str__", arg, kws, "self", "args"); PyObject self = ap.getPyObject(0); PyString str = self.__getattr__("msg").__str__(); PyObject filename = basename(self.__findattr__("filename")); @@ -376,7 +395,7 @@ } public static PyString EnvironmentError__str__(PyObject[] arg, String[] kws) { - ArgParser ap = new ArgParser("__init__", arg, kws, "self"); + ArgParser ap = new ArgParser("__str__", arg, kws, "self"); PyObject self = ap.getPyObject(0); if (self.__getattr__("filename") != Py.None) { @@ -415,6 +434,176 @@ } } + public static PyObject UnicodeError(PyObject[] arg, String[] kws) { + PyObject dict = empty__init__(arg, kws); + dict.__setitem__("__init__", getJavaFunc("UnicodeError__init__")); + return dict; + } + + public static void UnicodeError__init__(PyObject[] arg, String[] kws, PyObject objectType) { + ArgParser ap = new 
ArgParser("__init__", arg, kws, "self", "args"); + PyObject self = ap.getPyObject(0); + PyObject args = ap.getList(1); + self.__setattr__("args", args); + if(args.__len__() != 5){ + throw Py.TypeError(""); + } + if(!isPyString(args.__getitem__(0)) + || !Py.isInstance(args.__getitem__(1), objectType)|| + !isPyInt(args.__getitem__(2)) || + !isPyInt(args.__getitem__(3)) + || !isPyString(args.__getitem__(4))){ + throw Py.TypeError(""); + } + self.__setattr__("encoding", args.__getitem__(0)); + self.__setattr__("object", args.__getitem__(1)); + self.__setattr__("start", args.__getitem__(2)); + self.__setattr__("end", args.__getitem__(3)); + self.__setattr__("reason", args.__getitem__(4)); + } + + private static boolean isPyInt(PyObject object) { + return Py.isInstance(object, PyType.fromClass(PyInteger.class)); + } + + private static boolean isPyString(PyObject item) { + return Py.isInstance(item, PyType.fromClass(PyString.class)); + } + + public static void UnicodeDecodeError__init__(PyObject[] arg, String[] kws) { + UnicodeError__init__(arg, kws, PyType.fromClass(PyString.class)); + } + + public static PyString UnicodeDecodeError__str__(PyObject[] arg, String[] kws) { + ArgParser ap = new ArgParser("__str__", arg, kws, "self"); + PyObject self = ap.getPyObject(0); + int start = ((PyInteger)self.__getattr__("start")).getValue(); + int end = ((PyInteger)self.__getattr__("end")).getValue(); + + if(end == (start + 1)) { + PyInteger badByte = new PyInteger((int)(self.__getattr__("object") + .toString().charAt(start)) & 0xff); + return Py.newString("'%.400s' codec can't decode byte 0x%02x in position %d: %.400s") + .__mod__(new PyTuple(new PyObject[] {self.__getattr__("encoding"), + badByte, + self.__getattr__("start"), + self.__getattr__("reason")})) + .__str__(); + } else { + return Py.newString("'%.400s' codec can't decode bytes in position %d-%d: %.400s") + .__mod__(new PyTuple(new PyObject[] {self.__getattr__("encoding"), + self.__getattr__("start"), + new 
PyInteger(end - 1), + self.__getattr__("reason")})) + .__str__(); + } + } + + public static PyObject UnicodeDecodeError(PyObject[] arg, String[] kws) { + PyObject dict = empty__init__(arg, kws); + dict.__setitem__("__init__", getJavaFunc("UnicodeDecodeError__init__")); + dict.__setitem__("__str__", getJavaFunc("UnicodeDecodeError__str__")); + return dict; + } + + public static void UnicodeEncodeError__init__(PyObject[] arg, String[] kws) { + UnicodeError__init__(arg, kws, PyType.fromClass(PyBaseString.class)); + } + + public static PyString UnicodeEncodeError__str__(PyObject[] arg, String[] kws) { + ArgParser ap = new ArgParser("__str__", arg, kws, "self"); + PyObject self = ap.getPyObject(0); + int start = ((PyInteger)self.__getattr__("start")).getValue(); + int end = ((PyInteger)self.__getattr__("end")).getValue(); + + if(end == (start + 1)) { + int badchar = (int)(self.__getattr__("object").toString().charAt(start)); + String format; + if(badchar <= 0xff) + format = "'%.400s' codec can't encode character u'\\x%02x' in position %d: %.400s"; + else if(badchar <= 0xffff) + format = "'%.400s' codec can't encode character u'\\u%04x' in position %d: %.400s"; + else + format = "'%.400s' codec can't encode character u'\\U%08x' in position %d: %.400s"; + return Py.newString(format) + .__mod__(new PyTuple(new PyObject[] {self.__getattr__("encoding"), + new PyInteger(badchar), + self.__getattr__("start"), + self.__getattr__("reason")})) + .__str__(); + } else { + return Py.newString("'%.400s' codec can't encode characters in position %d-%d: %.400s") + .__mod__(new PyTuple(new PyObject[] {self.__getattr__("encoding"), + self.__getattr__("start"), + new PyInteger(end - 1), + self.__getattr__("reason")})) + .__str__(); + } + } + + public static PyObject UnicodeEncodeError(PyObject[] arg, String[] kws) { + PyObject dict = empty__init__(arg, kws); + dict.__setitem__("__init__", getJavaFunc("UnicodeEncodeError__init__")); + dict.__setitem__("__str__", 
getJavaFunc("UnicodeEncodeError__str__")); + return dict; + } + + public static void UnicodeTranslateError__init__(PyObject[] arg, String[] kws) { + ArgParser ap = new ArgParser("__init__", arg, kws, "self", "args"); + PyObject self = ap.getPyObject(0); + PyObject args = ap.getList(1); + if(args.__len__() != 4){ + throw Py.TypeError(""); + } + if(!Py.isInstance(args.__getitem__(0), PyType.fromClass(PyBaseString.class))|| + !isPyInt(args.__getitem__(1)) || + !isPyInt(args.__getitem__(2)) + || !isPyString(args.__getitem__(3))){ + throw Py.TypeError(""); + } + self.__setattr__("args", args); + self.__setattr__("object", args.__getitem__(0)); + self.__setattr__("start", args.__getitem__(1)); + self.__setattr__("end", args.__getitem__(2)); + self.__setattr__("reason", args.__getitem__(3)); + } + + public static PyString UnicodeTranslateError__str__(PyObject[] arg, String[] kws) { + ArgParser ap = new ArgParser("__str__", arg, kws, "self"); + PyObject self = ap.getPyObject(0); + int start = ((PyInteger)self.__getattr__("start")).getValue(); + int end = ((PyInteger)self.__getattr__("end")).getValue(); + + if(end == (start + 1)) { + int badchar = (int)(self.__getattr__("object").toString().charAt(start)); + String format; + if(badchar <= 0xff) + format = "can't translate character u'\\x%02x' in position %d: %.400s"; + else if(badchar <= 0xffff) + format = "can't translate character u'\\u%04x' in position %d: %.400s"; + else + format = "can't translate character u'\\U%08x' in position %d: %.400s"; + return Py.newString(format) + .__mod__(new PyTuple(new PyObject[] {new PyInteger(badchar), + self.__getattr__("start"), + self.__getattr__("reason")})) + .__str__(); + } else { + return Py.newString("can't translate characters in position %d-%d: %.400s") + .__mod__(new PyTuple(new PyObject[] {self.__getattr__("start"), + new PyInteger(end - 1), + self.__getattr__("reason")})) + .__str__(); + } + } + + public static PyObject UnicodeTranslateError(PyObject[] arg, String[] kws) { + 
PyObject dict = empty__init__(arg, kws); + dict.__setitem__("__init__", getJavaFunc("UnicodeTranslateError__init__")); + dict.__setitem__("__str__", getJavaFunc("UnicodeTranslateError__str__")); + return dict; + } + private static PyObject getJavaFunc(String name) { return Py.newJavaFunc(exceptions.class, name); } Index: src/org/python/core/Py.java =================================================================== --- src/org/python/core/Py.java (revision 2833) +++ src/org/python/core/Py.java (working copy) @@ -229,6 +229,49 @@ return new PyException(Py.UnicodeError, message); } + public static PyObject UnicodeTranslateError; + public static PyException UnicodeTranslateError(String object, + int start, + int end, + String reason) { + return new PyException(Py.UnicodeTranslateError, + new PyTuple(new PyObject[] { + new PyString(object), + new PyInteger(start), + new PyInteger(end), + new PyString(reason)})); + } + + public static PyObject UnicodeDecodeError; + + public static PyException UnicodeDecodeError(String encoding, + String object, + int start, + int end, + String reason) { + return new PyException(Py.UnicodeDecodeError, + new PyTuple(new PyObject[] {new PyString(encoding), + new PyString(object), + new PyInteger(start), + new PyInteger(end), + new PyString(reason)})); + } + + public static PyObject UnicodeEncodeError; + + public static PyException UnicodeEncodeError(String encoding, + String object, + int start, + int end, + String reason) { + return new PyException(Py.UnicodeEncodeError, + new PyTuple(new PyObject[] {new PyString(encoding), + new PyString(object), + new PyInteger(start), + new PyInteger(end), + new PyString(reason)})); + } + public static PyObject EOFError; public static PyException EOFError(String message) { return new PyException(Py.EOFError, message); @@ -274,6 +317,11 @@ public static void DeprecationWarning(String message) { warning(DeprecationWarning, message); } + + public static PyObject PendingDeprecationWarning; + public static 
void PendingDeprecationWarning(String message) { + warning( PendingDeprecationWarning, message); + } public static PyObject SyntaxWarning; public static void SyntaxWarning(String message) { @@ -289,6 +337,11 @@ public static void RuntimeWarning(String message) { warning(RuntimeWarning, message); } + + public static PyObject FutureWarning; + public static void FutureWarning(String message) { + warning(FutureWarning, message); + } private static PyObject warnings_mod; private static PyObject importWarnings() { @@ -611,15 +664,20 @@ FloatingPointError = initExc("FloatingPointError", exc, dict); ValueError = initExc("ValueError", exc, dict); UnicodeError = initExc("UnicodeError", exc, dict); + UnicodeEncodeError = initExc("UnicodeEncodeError", exc, dict); + UnicodeDecodeError = initExc("UnicodeDecodeError", exc, dict); + UnicodeTranslateError = initExc("UnicodeTranslateError", exc, dict); ReferenceError = initExc("ReferenceError", exc, dict); SystemError = initExc("SystemError", exc, dict); MemoryError = initExc("MemoryError", exc, dict); Warning = initExc("Warning", exc, dict); UserWarning = initExc("UserWarning", exc, dict); DeprecationWarning = initExc("DeprecationWarning", exc, dict); + PendingDeprecationWarning = initExc("PendingDeprecationWarning", exc, dict); SyntaxWarning = initExc("SyntaxWarning", exc, dict); OverflowWarning = initExc("OverflowWarning", exc, dict); RuntimeWarning = initExc("RuntimeWarning", exc, dict); + FutureWarning = initExc("FutureWarning", exc, dict); } public static PySystemState defaultSystemState; Index: src/org/python/modules/_codecs.java =================================================================== --- src/org/python/modules/_codecs.java (revision 2833) +++ src/org/python/modules/_codecs.java (working copy) @@ -10,9 +10,12 @@ import org.python.core.Py; import org.python.core.PyInteger; +import org.python.core.PyNone; import org.python.core.PyObject; import org.python.core.PyString; +import org.python.core.PySystemState; import 
org.python.core.PyTuple; +import org.python.core.PyUnicode; import org.python.core.codecs; public class _codecs { @@ -26,10 +29,22 @@ return codecs.lookup(encoding); } + public static PyObject lookup_error(String handlerName) { + return codecs.lookup_error(handlerName); + } + public static void register_error(String name, PyObject errorHandler) { + codecs.register_error(name, errorHandler); + } + private static PyTuple decode_tuple(String s, int len) { + return new PyTuple(new PyObject[] { + new PyUnicode(s), + Py.newInteger(len) + }); + } - private static PyTuple codec_tuple(String s, int len) { + private static PyTuple encode_tuple(String s, int len) { return new PyTuple(new PyObject[] { Py.java2py(s), Py.newInteger(len) @@ -45,7 +60,7 @@ public static PyTuple utf_8_decode(String str, String errors) { int size = str.length(); - return codec_tuple(codecs.PyUnicode_DecodeUTF8(str, errors), size); + return decode_tuple(codecs.PyUnicode_DecodeUTF8(str, errors), size); } @@ -55,55 +70,115 @@ public static PyTuple utf_8_encode(String str, String errors) { int size = str.length(); - return codec_tuple(codecs.PyUnicode_EncodeUTF8(str, errors), size); + return encode_tuple(codecs.PyUnicode_EncodeUTF8(str, errors), size); } + /* --- UTF-7 Codec --------------------------------------------------- */ + public static PyTuple utf_7_decode(String str) { + return utf_7_decode(str, null); + } + + public static PyTuple utf_7_decode(String str, String errors) { + int size = str.length(); + return decode_tuple(codecs.PyUnicode_DecodeUTF7(str, errors), size); + } + + + public static PyTuple utf_7_encode(String str) { + return utf_7_encode(str, null); + } + + public static PyTuple utf_7_encode(String str, String errors) { + int size = str.length(); + return encode_tuple(codecs.PyUnicode_EncodeUTF7(str, false, false, errors), size); + } + + public static PyTuple escape_decode(String str){ + return escape_decode(str, null); + } + + public static PyTuple escape_decode(String str, String 
errors) { + return decode_tuple(PyString.decode_UnicodeEscape(str, + 0, + str.length(), + errors, + true), str.length()); + } + + public static PyTuple escape_encode(String str){ + return escape_encode(str, null); + } + + public static PyTuple escape_encode(String str, String errors) { + return encode_tuple(PyString.encode_UnicodeEscape(str, false), + str.length()); + + } + /* --- Character Mapping Codec --------------------------------------- */ - public static PyTuple charmap_decode(String str, String errors, + public static PyTuple charmap_decode(String str, + String errors, PyObject mapping) { + return charmap_decode(str, errors, mapping, false); + } + + public static PyTuple charmap_decode(String str, + String errors, + PyObject mapping, boolean ignoreUnmapped) { + + int size = str.length(); StringBuffer v = new StringBuffer(size); - - for (int i = 0; i < size; i++) { + for(int i = 0; i < size; i++) { char ch = str.charAt(i); - if (ch > 0xFF) { - codecs.decoding_error("charmap", v, errors, - "ordinal not in range(255)"); - i++; + if(ch > 0xFF) { + i = codecs.insertReplacementAndGetResume(v, + errors, + "charmap", + str, + i, + i + 1, + "ordinal not in range(255)") - 1; continue; } - PyObject w = Py.newInteger(ch); PyObject x = mapping.__finditem__(w); - if (x == null) { - /* No mapping found: default to Latin-1 mapping if possible */ - v.append(ch); + if(x == null) { + if(ignoreUnmapped){ + v.append(ch); + }else{ +i = codecs.insertReplacementAndGetResume(v, errors, "charmap", str, i, i + 1, "no mapping found") - 1; + } continue; } - /* Apply mapping */ - if (x instanceof PyInteger) { - int value = ((PyInteger) x).getValue(); - if (value < 0 || value > 65535) - throw Py.TypeError( - "character mapping must be in range(65535)"); - v.append((char) value); - } else if (x == Py.None) { - codecs.decoding_error("charmap", v, errors, - "character maps to "); - } else if (x instanceof PyString) { + if(x instanceof PyInteger) { + int value = ((PyInteger)x).getValue(); + 
if(value < 0 || value > PySystemState.maxunicode) { + throw Py.TypeError("character mapping must return " + + "integer greater than 0 and less than sys.maxunicode"); + } + v.append((char)value); + } else if(x == Py.None) { + i = codecs.insertReplacementAndGetResume(v, + errors, + "charmap", + str, + i, + i + 1, + "character maps to ") - 1; + } else if(x instanceof PyString) { v.append(x.toString()); - } - else { + } else { /* wrong return value */ - throw Py.TypeError("character mapping must return integer, " + - "None or unicode"); + throw Py.TypeError("character mapping must return " + + "integer, None or str"); } } - return codec_tuple(v.toString(), size); + return decode_tuple(v.toString(), size); } @@ -112,54 +187,93 @@ public static PyTuple charmap_encode(String str, String errors, PyObject mapping) { + //Default to Latin-1 + if(mapping == null){ + return latin_1_encode(str, errors); + } + return charmap_encode_internal(str, errors, mapping, new StringBuffer(str.length()), true); + } + + private static PyTuple charmap_encode_internal(String str, + String errors, + PyObject mapping, + StringBuffer v, + boolean letLookupHandleError) { int size = str.length(); - StringBuffer v = new StringBuffer(size); - - for (int i = 0; i < size; i++) { + for(int i = 0; i < size; i++) { char ch = str.charAt(i); PyObject w = Py.newInteger(ch); PyObject x = mapping.__finditem__(w); - if (x == null) { - /* No mapping found: default to Latin-1 mapping if possible */ - if (ch < 256) - v.append(ch); - else - codecs.encoding_error("charmap", v, errors, - "missing character mapping"); - continue; - } - if (x instanceof PyInteger) { - int value = ((PyInteger) x).getValue(); - if (value < 0 || value > 255) - throw Py.TypeError( - "character mapping must be in range(256)"); - v.append((char) value); - } else if (x == Py.None) { - codecs.encoding_error("charmap", v, errors, - "character maps to "); - } else if (x instanceof PyString) { + if(x == null) { + if(letLookupHandleError) { + i = 
handleBadMapping(str, errors, mapping, v, size, i); + } else { + throw Py.UnicodeEncodeError("charmap", + str, + i, + i + 1, + "character maps to "); + } + }else + if(x instanceof PyInteger) { + int value = ((PyInteger)x).getValue(); + if(value < 0 || value > 255) + throw Py.TypeError("character mapping must be in range(256)"); + v.append((char)value); + } else if(x instanceof PyString && !(x instanceof PyUnicode)) { v.append(x.toString()); - } - else { + } else if(x instanceof PyNone){ + i = handleBadMapping(str, errors, mapping, v, size, i); + }else { /* wrong return value */ - throw Py.TypeError("character mapping must return " + - "integer, None or unicode"); + throw Py.TypeError("character mapping must return " + + "integer, None or str"); } } - return codec_tuple(v.toString(), size); + return encode_tuple(v.toString(), size); } + private static int handleBadMapping(String str, + String errors, + PyObject mapping, + StringBuffer v, + int size, + int i) { + if(errors != null) { + if(errors.equals(codecs.IGNORE)) { + return i; + } else if(errors.equals(codecs.REPLACE)) { + charmap_encode_internal("?", errors, mapping, v, false); + return i; + } else if(errors.equals(codecs.XMLCHARREFREPLACE)) { + charmap_encode_internal(codecs.xmlcharrefreplace(i, i + 1, str) + .toString(), errors, mapping, v, false); + return i; + } else if(errors.equals(codecs.BACKSLASHREPLACE)) { + charmap_encode_internal(codecs.backslashreplace(i, i + 1, str) + .toString(), errors, mapping, v, false); + return i; + } + } + PyObject replacement = codecs.encoding_error(errors, + "charmap", + str, + i, + i + 1, + "character maps to "); + String replStr = replacement.__getitem__(0).toString(); + charmap_encode_internal(replStr, errors, mapping, v, false); + return codecs.calcNewPosition(size, replacement) - 1; + } - /* --- 7-bit ASCII Codec -------------------------------------------- */ - public static PyTuple ascii_decode(String str) { return ascii_decode(str, null); } public static PyTuple 
ascii_decode(String str, String errors) { int size = str.length(); - return codec_tuple(codecs.PyUnicode_DecodeASCII(str, size, errors), + return decode_tuple(codecs.PyUnicode_DecodeASCII(str, size, errors), size); } @@ -170,7 +284,7 @@ public static PyTuple ascii_encode(String str, String errors) { int size = str.length(); - return codec_tuple(codecs.PyUnicode_EncodeASCII(str, size, errors), + return encode_tuple(codecs.PyUnicode_EncodeASCII(str, size, errors), size); } @@ -183,21 +297,8 @@ public static PyTuple latin_1_decode(String str, String errors) { int size = str.length(); - StringBuffer v = new StringBuffer(size); - - for (int i = 0; i < size; i++) { - char ch = str.charAt(i); - if (ch < 256) { - v.append(ch); - } else { - codecs.decoding_error("latin-1", v, errors, - "ordinal not in range(256)"); - i++; - continue; - } - } - - return codec_tuple(v.toString(), size); + return decode_tuple(codecs.PyUnicode_DecodeLatin1(str, size, errors), + size); } @@ -207,17 +308,7 @@ public static PyTuple latin_1_encode(String str, String errors) { int size = str.length(); - StringBuffer v = new StringBuffer(size); - - for (int i = 0; i < size; i++) { - char ch = str.charAt(i); - if (ch >= 256) { - codecs.encoding_error("latin-1", v, errors, - "ordinal not in range(256)"); - } else - v.append(ch); - } - return codec_tuple(v.toString(), size); + return encode_tuple(codecs.PyUnicode_EncodeLatin1(str, size, errors), size); } @@ -229,12 +320,12 @@ } public static PyTuple utf_16_encode(String str, String errors) { - return codec_tuple(encode_UTF16(str, errors, 0), str.length()); + return encode_tuple(encode_UTF16(str, errors, 0), str.length()); } public static PyTuple utf_16_encode(String str, String errors, int byteorder) { - return codec_tuple(encode_UTF16(str, errors, byteorder), + return encode_tuple(encode_UTF16(str, errors, byteorder), str.length()); } @@ -243,7 +334,7 @@ } public static PyTuple utf_16_le_encode(String str, String errors) { - return 
codec_tuple(encode_UTF16(str, errors, -1), str.length()); + return encode_tuple(encode_UTF16(str, errors, -1), str.length()); } public static PyTuple utf_16_be_encode(String str) { @@ -251,7 +342,7 @@ } public static PyTuple utf_16_be_encode(String str, String errors) { - return codec_tuple(encode_UTF16(str, errors, 1), str.length()); + return encode_tuple(encode_UTF16(str, errors, 1), str.length()); } @@ -291,14 +382,13 @@ } public static PyTuple utf_16_decode(String str, String errors) { - int[] bo = new int[] { 0 }; - return codec_tuple(decode_UTF16(str, errors, bo), str.length()); + return utf_16_decode(str, errors, 0); } public static PyTuple utf_16_decode(String str, String errors, int byteorder) { int[] bo = new int[] { byteorder }; - return codec_tuple(decode_UTF16(str, errors, bo), str.length()); + return decode_tuple(decode_UTF16(str, errors, bo), str.length()); } public static PyTuple utf_16_le_decode(String str) { @@ -307,7 +397,7 @@ public static PyTuple utf_16_le_decode(String str, String errors) { int[] bo = new int[] { -1 }; - return codec_tuple(decode_UTF16(str, errors, bo), str.length()); + return decode_tuple(decode_UTF16(str, errors, bo), str.length()); } public static PyTuple utf_16_be_decode(String str) { @@ -316,7 +406,7 @@ public static PyTuple utf_16_be_decode(String str, String errors) { int[] bo = new int[] { 1 }; - return codec_tuple(decode_UTF16(str, errors, bo), str.length()); + return decode_tuple(decode_UTF16(str, errors, bo), str.length()); } public static PyTuple utf_16_ex_decode(String str) { @@ -338,67 +428,70 @@ }); } - private static String decode_UTF16(String str, String errors, + private static String decode_UTF16(String str, + String errors, int[] byteorder) { int bo = 0; - if (byteorder != null) - bo = byteorder[0]; - + if(byteorder != null) + bo = byteorder[0]; int size = str.length(); - - if (size % 2 != 0) - codecs.decoding_error("UTF16", null, errors, "truncated data"); - - StringBuffer v = new StringBuffer(size/2); - - 
for (int i = 0; i < size; i += 2) { + StringBuffer v = new StringBuffer(size / 2); + for(int i = 0; i < size; i += 2) { char ch1 = str.charAt(i); - char ch2 = str.charAt(i+1); - if (ch1 == 0xFE && ch2 == 0xFF) { + if(i + 1 == size) { + i = codecs.insertReplacementAndGetResume(v, + errors, + "utf-16", + str, + i, + i + 1, + "truncated data"); + continue; + } + char ch2 = str.charAt(i + 1); + if(ch1 == 0xFE && ch2 == 0xFF) { bo = 1; continue; - } else if (ch1 == 0xFF && ch2 == 0xFE) { + } else if(ch1 == 0xFF && ch2 == 0xFE) { bo = -1; continue; } - char ch; - if (bo == -1) - ch = (char) (ch2 << 8 | ch1); + if(bo == -1) + ch = (char)(ch2 << 8 | ch1); else - ch = (char) (ch1 << 8 | ch2); - - if (ch < 0xD800 || ch > 0xDFFF) { + ch = (char)(ch1 << 8 | ch2); + if(ch < 0xD800 || ch > 0xDFFF) { v.append(ch); continue; } - - - /* UTF-16 code pair: */ - if (i == size-1) { - codecs.decoding_error("UTF-16", v, errors, - "unexpected end of data"); - continue; - } - ch = str.charAt(++i); - if (0xDC00 <= ch && ch <= 0xDFFF) { - ch = str.charAt(++i); - if (0xD800 <= ch && ch <= 0xDBFF) - /* This is valid data (a UTF-16 surrogate pair), but - we are not able to store this information since our - Py_UNICODE type only has 16 bits... this might - change someday, even though it's unlikely. 
*/ - codecs.decoding_error("UTF-16", v, errors, - "code pairs are not supported"); + if(0xDC00 <= ch && ch <= 0xDFFF) { + ch2 = str.charAt(++i); + if(0xD800 <= ch2 && ch2 <= 0xDBFF) { + v.append(ch); + v.append(ch2); + continue; + } + i = codecs.insertReplacementAndGetResume(v, + errors, + "utf-16", + str, + i, + i + 1, + "illegal UTF-16 surrogate"); continue; } - codecs.decoding_error("UTF-16", v, errors, "illegal encoding"); + i = codecs.insertReplacementAndGetResume(v, + errors, + "utf-16", + str, + i, + i + 1, + "illegal encoding"); } - - if (byteorder != null) + if(byteorder != null) byteorder[0] = bo; - return v.toString(); } @@ -413,7 +506,7 @@ public static PyTuple raw_unicode_escape_encode(String str, String errors) { - return codec_tuple(codecs.PyUnicode_EncodeRawUnicodeEscape(str, + return encode_tuple(codecs.PyUnicode_EncodeRawUnicodeEscape(str, errors, false), str.length()); } @@ -425,7 +518,7 @@ public static PyTuple raw_unicode_escape_decode(String str, String errors) { - return codec_tuple(codecs.PyUnicode_DecodeRawUnicodeEscape(str, + return decode_tuple(codecs.PyUnicode_DecodeRawUnicodeEscape(str, errors), str.length()); } @@ -440,7 +533,7 @@ } public static PyTuple unicode_escape_encode(String str, String errors) { - return codec_tuple(PyString.encode_UnicodeEscape(str, false), + return encode_tuple(PyString.encode_UnicodeEscape(str, false), str.length()); } @@ -450,12 +543,13 @@ public static PyTuple unicode_escape_decode(String str, String errors) { int n = str.length(); - return codec_tuple(PyString.decode_UnicodeEscape(str, - 0, n, errors, true), n); + return decode_tuple(PyString.decode_UnicodeEscape(str, + 0, + n, + errors, + true), n); } - - /* --- UnicodeInternal Codec ------------------------------------------ */ @@ -464,7 +558,7 @@ } public static PyTuple unicode_internal_encode(String str, String errors) { - return codec_tuple(str, str.length()); + return encode_tuple(str, str.length()); } public static PyTuple 
unicode_internal_decode(String str) { @@ -472,7 +566,7 @@ } public static PyTuple unicode_internal_decode(String str, String errors) { - return codec_tuple(str, str.length()); + return decode_tuple(str, str.length()); } }