diff -r 6cee6fef06f0 Lib/encodings/shift_jis.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/Lib/encodings/shift_jis.py	Thu Jun 12 15:48:56 2014 -0600
@@ -0,0 +1,275 @@
+"""Shift_JIS codec for Jython, backed by java.nio.charset."""
+
+import codecs
+
+from array import array
+from java.lang import StringBuilder
+from java.nio import ByteBuffer, CharBuffer
+from java.nio.charset import Charset
+from StringIO import StringIO
+
+
+def _call_error_handler(error_function, exc, input_buffer):
+    """Run a codecs error handler and validate its answer.
+
+    Returns (replacement, pos) where pos is an absolute position inside
+    input_buffer.  Raises TypeError when the replacement is not unicode
+    and IndexError when the resume position falls outside the input.
+    """
+    replacement, pos = error_function(exc)
+    if not isinstance(replacement, unicode):
+        raise TypeError('error handler must return a unicode replacement')
+    pos = int(pos)
+    limit = input_buffer.limit()
+    if pos < 0:
+        # Negative positions count back from the end of the input.
+        pos += limit
+    if pos < 0 or pos > limit:
+        raise IndexError('position %d from error handler out of bounds' % pos)
+    return replacement, pos
+
+
+def _process_decode_errors(decoder, input, result, error_function, input_buffer, builder):
+    """Apply the error handler when the Java decoder reported an error.
+
+    Does nothing unless result.isError().  Otherwise the replacement text
+    is appended to builder and input_buffer is moved to the position the
+    handler asked to resume from.
+    """
+    if not result.isError():
+        return
+    start = input_buffer.position()
+    e = UnicodeDecodeError(
+        str(decoder.charset().name()),
+        input,
+        start,
+        start + result.length(),
+        'illegal multibyte sequence')
+    replacement, pos = _call_error_handler(error_function, e, input_buffer)
+    builder.append(replacement)
+    input_buffer.position(pos)
+
+
+def _process_incomplete_decode(decoder, input, error_function, input_buffer, builder):
+    """Report bytes left undecoded at the end of a final decode.
+
+    Does nothing when input_buffer has been fully consumed.  Otherwise the
+    remaining bytes are handed to the error handler as one bad sequence.
+    """
+    start = input_buffer.position()
+    limit = input_buffer.limit()
+    if start >= limit:
+        return
+    e = UnicodeDecodeError(
+        str(decoder.charset().name()),
+        input,
+        start,
+        limit,
+        'illegal multibyte sequence')
+    replacement, pos = _call_error_handler(error_function, e, input_buffer)
+    builder.append(replacement)
+    input_buffer.position(pos)
+
+
+def decode(input, errors='strict', final=True):
+    """Decode Shift_JIS bytes to unicode.
+
+    Returns (text, consumed).  With final=False a trailing incomplete
+    sequence is left unconsumed instead of being treated as an error.
+    """
+    error_function = codecs.lookup_error(errors)
+    input_buffer = ByteBuffer.wrap(array('b', input))
+    decoder = Charset.forName('Shift_JIS').newDecoder()
+    output_buffer = CharBuffer.allocate(min(max(len(input) // 2, 256), 1024))
+    builder = StringBuilder(int(decoder.averageCharsPerByte() * len(input)))
+
+    while True:
+        # endOfInput is always False so that a truncated trailing sequence
+        # shows up as underflow and can be handled according to `final`.
+        result = decoder.decode(input_buffer, output_buffer, False)
+        pos = output_buffer.position()
+        output_buffer.rewind()
+        builder.append(output_buffer.subSequence(0, pos))
+        if result.isUnderflow():
+            if final:
+                _process_incomplete_decode(decoder, input, error_function, input_buffer, builder)
+            break
+        _process_decode_errors(decoder, input, result, error_function, input_buffer, builder)
+
+    return builder.toString(), input_buffer.position()
+
+
+def _get_unicode(input_buffer, result):
+    """Return the first result.length() chars of input_buffer as a string."""
+    return input_buffer.subSequence(0, result.length()).toString()
+
+
+def _process_encode_errors(encoder, input, result, error_function, input_buffer, builder):
+    """Apply the error handler when the Java encoder reported an error.
+
+    Does nothing unless result.isError().  Otherwise the replacement is
+    written to builder and input_buffer is moved to the position the
+    handler asked to resume from.
+    """
+    if not result.isError():
+        return
+    start = input_buffer.position()
+    e = UnicodeEncodeError(
+        str(encoder.charset().name()),
+        input,
+        start,
+        start + result.length(),
+        'illegal multibyte sequence')
+    replacement, pos = _call_error_handler(error_function, e, input_buffer)
+    # NOTE(review): str() only copes with ASCII replacements - confirm
+    # that is sufficient for the registered error handlers.
+    builder.write(str(replacement))
+    input_buffer.position(pos)
+
+
+def encode(input, errors='strict'):
+    """Encode unicode to Shift_JIS bytes; returns (bytes, len(input))."""
+    error_function = codecs.lookup_error(errors)
+    # workaround non-BMP issues - need to get the exact count of chars, not codepoints
+    input_buffer = CharBuffer.allocate(StringBuilder(input).length())
+    input_buffer.put(input)
+    input_buffer.rewind()
+    encoder = Charset.forName('Shift_JIS').newEncoder()
+    output_buffer = ByteBuffer.allocate(min(max(len(input) * 2, 256), 1024))
+    builder = StringIO()
+
+    while True:
+        result = encoder.encode(input_buffer, output_buffer, True)
+        pos = output_buffer.position()
+        output_buffer.rewind()
+        builder.write(output_buffer.array()[0:pos].tostring())
+        if result.isUnderflow():
+            break
+        _process_encode_errors(encoder, input, result, error_function, input_buffer, builder)
+
+    return builder.getvalue(), len(input)
+
+
+### Codec APIs
+
+class Codec(codecs.Codec):
+
+    def encode(self, input, errors='strict'):
+        return encode(input, errors)
+
+    def decode(self, input, errors='strict'):
+        return decode(input, errors)
+
+
+class NonfinalCodec(codecs.Codec):
+
+    def encode(self, input, errors='strict'):
+        return encode(input, errors)
+
+    def decode(self, input, errors='strict'):
+        return decode(input, errors, final=False)
+
+
+class IncrementalEncoder(codecs.IncrementalEncoder):
+
+    def __init__(self, errors='strict'):
+        self.errors = errors
+        self.encoder = Charset.forName('Shift_JIS').newEncoder()
+        self.output_buffer = ByteBuffer.allocate(1024)
+
+    def encode(self, input, final=False):
+        error_function = codecs.lookup_error(self.errors)
+        # workaround non-BMP issues - need to get the exact count of chars, not codepoints
+        input_buffer = CharBuffer.allocate(StringBuilder(input).length())
+        input_buffer.put(input)
+        input_buffer.rewind()
+        self.output_buffer.rewind()
+        builder = StringIO()
+
+        while True:
+            result = self.encoder.encode(input_buffer, self.output_buffer, final)
+            pos = self.output_buffer.position()
+            self.output_buffer.rewind()
+            builder.write(self.output_buffer.array()[0:pos].tostring())
+            if result.isUnderflow():
+                break
+            _process_encode_errors(self.encoder, input, result, error_function, input_buffer, builder)
+
+        if final:
+            # A Java encoder that has seen endOfInput must be reset
+            # before it may start another encoding operation.
+            self.encoder.reset()
+        return builder.getvalue()
+
+    def reset(self):
+        self.encoder.reset()
+
+
+class IncrementalDecoder(codecs.IncrementalDecoder):
+
+    def __init__(self, errors='strict'):
+        self.errors = errors
+        self.decoder = Charset.forName('Shift_JIS').newDecoder()
+        self.output_buffer = CharBuffer.allocate(1024)
+        self.buffer = ''
+
+    def decode(self, input, final=False):
+        error_function = codecs.lookup_error(self.errors)
+        # Positions reported by the Java buffer index into the carried-over
+        # bytes plus the new input, so errors must reference that string.
+        data = self.buffer + str(input)
+        input_array = array('b', data)
+        input_buffer = ByteBuffer.wrap(input_array)
+        builder = StringBuilder(int(self.decoder.averageCharsPerByte() * len(data)))
+        self.output_buffer.rewind()
+
+        while True:
+            result = self.decoder.decode(input_buffer, self.output_buffer, final)
+            pos = self.output_buffer.position()
+            self.output_buffer.rewind()
+            builder.append(self.output_buffer.subSequence(0, pos))
+            if result.isUnderflow():
+                if not final:
+                    # Keep around any remaining input for next call to decode
+                    self.buffer = input_array[input_buffer.position():input_buffer.limit()].tostring()
+                else:
+                    _process_incomplete_decode(self.decoder, data, error_function, input_buffer, builder)
+                break
+            _process_decode_errors(self.decoder, data, result, error_function, input_buffer, builder)
+
+        if final:
+            # Everything has been consumed; drop stale carry-over bytes and
+            # reset the Java decoder so this object can be used again.
+            self.buffer = ''
+            self.decoder.reset()
+        return builder.toString()
+
+    def reset(self):
+        self.buffer = ""
+        self.decoder.reset()
+
+    def getstate(self):
+        return self.buffer or 0
+
+    def setstate(self, state):
+        self.buffer = state or ""
+
+
+class StreamWriter(NonfinalCodec,codecs.StreamWriter):
+    pass
+
+class StreamReader(NonfinalCodec,codecs.StreamReader):
+    pass
+
+### encodings module API
+
+def getregentry():
+    return codecs.CodecInfo(
+        name='shift_jis',
+        encode=Codec().encode,
+        decode=Codec().decode,
+        incrementalencoder=IncrementalEncoder,
+        incrementaldecoder=IncrementalDecoder,
+        streamreader=StreamReader,
+        streamwriter=StreamWriter,
+    )
diff -r 6cee6fef06f0 Lib/test/test_codecs.py
--- a/Lib/test/test_codecs.py	Sun Jun 08 10:03:49 2014 +0100
+++ b/Lib/test/test_codecs.py	Thu Jun 12 15:48:56 2014 -0600
@@ -1347,7 +1347,7 @@
     "punycode",
     "raw_unicode_escape",
     "rot_13",
-# FIXME: Jython issue 1066: 'shift_jis',
+    "shift_jis",
 # FIXME: Jython issue 1066: 'shift_jis_2004',
 # FIXME: Jython issue 1066: 'shift_jisx0213',
     "tis_620",