textutil/utf8.go - release.go.x.lib - Git at Google

 // Copyright 2015 The Vanadium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

 package textutil

 import (
 	"bytes"
 	"fmt"
 	"unicode/utf8"
 )

 // UTF8Encoder implements RuneEncoder for the UTF-8 encoding.
 //
 // It is an empty struct, so it holds no state between calls.
 type UTF8Encoder struct{}

 // Compile-time check that UTF8Encoder satisfies RuneEncoder.
 var _ RuneEncoder = UTF8Encoder{}

 // Encode appends the UTF-8 encoding of r to buf.
 func (UTF8Encoder) Encode(r rune, buf *bytes.Buffer) {
 	// The results are discarded on purpose: bytes.Buffer.WriteRune documents
 	// its error result as always nil.
 	_, _ = buf.WriteRune(r)
 }

 // UTF8ChunkDecoder implements RuneChunkDecoder for a stream of UTF-8 data that
 // is arbitrarily chunked.
 //
 // UTF-8 is a byte-wise encoding that may use multiple bytes to encode a single
 // rune.  This decoder buffers partial runes that have been split across chunks,
 // so that a full rune is returned when the subsequent data chunk is provided.
 //
 // This is commonly used to implement an io.Writer wrapper over UTF-8 text.  It
 // is useful since the data provided to Write calls may be arbitrarily chunked.
 //
 // The zero UTF8ChunkDecoder is a decoder with an empty buffer.
 type UTF8ChunkDecoder struct {
 	// The only state we keep is the last partial rune we've encountered.
 	//
 	// partial stores the buffered bytes.  It is sized to utf8.UTFMax, the
 	// maximum number of bytes in a UTF-8 encoded rune.
 	partial    [utf8.UTFMax]byte
 	// partialLen is the number of buffered bytes at the front of partial; zero
 	// means nothing is buffered.
 	partialLen int
 }

 // Compile-time check that *UTF8ChunkDecoder satisfies RuneChunkDecoder.
 var _ RuneChunkDecoder = (*UTF8ChunkDecoder)(nil)

 // Decode wraps chunk in a RuneStreamDecoder.  Keep calling Next on the result
 // until it yields EOF; at that point the whole chunk has been consumed.
 //
 // A rune whose encoding is cut off by the end of the chunk is not returned.
 // Its leading bytes are held in the decoder and are completed by the data
 // passed to the following Decode call.
 //
 // Each byte of an invalid encoding is reported as U+FFFD, following the rules
 // of unicode/utf8.DecodeRune.
 func (d *UTF8ChunkDecoder) Decode(chunk []byte) RuneStreamDecoder {
 	return &utf8Stream{
 		d:    d,
 		data: chunk,
 		pos:  0,
 	}
 }

 // DecodeLeftover exposes whatever is still buffered in the decoder as a
 // RuneStreamDecoder.  Keep calling Next on the result until it yields EOF so
 // that the buffer is fully drained.
 //
 // The buffer only ever holds the bytes of an unfinished rune, so the stream
 // yields nothing but U+FFFD values followed by EOF.
 func (d *UTF8ChunkDecoder) DecodeLeftover() RuneStreamDecoder {
 	return &utf8LeftoverStream{
 		d:   d,
 		pos: 0,
 	}
 }

 // nextRune produces the next rune from the logical concatenation of the
 // buffered partial rune (if any) and data.  The int result is the number of
 // bytes of data consumed by this call.
 //
 // The two results are independent: EOF may come with a size > 0 when the tail
 // of data was an incomplete rune that has now been buffered, and a real rune
 // may come with size 0 when it was decoded purely from buffered bytes.
 func (d *UTF8ChunkDecoder) nextRune(data []byte) (rune, int) {
 	if d.partialLen > 0 {
 		return d.nextRunePartial(data)
 	}
 	r, width := utf8.DecodeRune(data)
 	if r != utf8.RuneError || utf8.FullRune(data) {
 		// Either a rune other than U+FFFD, or a U+FFFD produced from a complete
 		// (possibly invalid) encoding; both are decoded straight from data.
 		return r, width
 	}
 	// data stops in the middle of a rune: stash what we have and wait for more.
 	buffered := copy(d.partial[:], data)
 	d.partialLen = buffered
 	return d.verifyPartial(buffered, data)
 }

 // nextRunePartial is the nextRune path taken while bytes of an incomplete rune
 // are still buffered from earlier input.
 func (d *UTF8ChunkDecoder) nextRunePartial(data []byte) (rune, int) {
 	// Top up the buffer from data; copy stops once partial is full.
 	buffered := d.partialLen
 	added := copy(d.partial[buffered:], data)
 	d.partialLen = buffered + added
 	pending := d.partial[:d.partialLen]
 	if !utf8.FullRune(pending) {
 		// Still incomplete, so all of data should now be in the buffer.
 		return d.verifyPartial(added, data)
 	}
 	r, width := utf8.DecodeRune(pending)
 	if width >= buffered {
 		// Every previously buffered byte has been consumed.  Empty the buffer
 		// and report only the bytes that were taken from data, so the caller
 		// resumes decoding directly from data.
 		d.partialLen = 0
 		return r, width - buffered
 	}
 	// Fewer bytes were consumed than were already buffered.  This happens
 	// when the buffered prefix turns out to be invalid: for example, with two
 	// bytes of a 3-byte rune buffered and a third byte that is not a UTF-8
 	// trailing byte, utf8.DecodeRune returns U+FFFD with a width of 1.
 	//
 	// Drop the consumed bytes, slide the rest of the old prefix to the front,
 	// and take nothing from data.  Shrinking partialLen on every such call is
 	// not required for correctness, but it avoids unnecessary repeated copying
 	// of data into the buffer.
 	copy(d.partial[:], d.partial[width:buffered])
 	d.partialLen = buffered - width
 	return r, 0
 }

 // verifyPartial handles the "rune still incomplete" outcome, after ncopy bytes
 // of data have been copied into the partial rune buffer.  In that state the
 // whole of data must have been buffered, so it reports EOF together with
 // len(data) as the consumed size.
 //
 // It panics if some of data was left uncopied.
 func (d *UTF8ChunkDecoder) verifyPartial(ncopy int, data []byte) (rune, int) {
 	if total := len(data); ncopy >= total {
 		return EOF, total
 	}
 	// The buffer filled up before data ran out, yet the rune is still
 	// incomplete.  That should be impossible, because any utf8.UTFMax bytes
 	// form a full rune.
 	panic(fmt.Errorf("UTF8ChunkDecoder: partial rune %v with leftover data %v", d.partial[:d.partialLen], data[ncopy:]))
 }

 // utf8Stream implements UTF8ChunkDecoder.Decode.
 //
 // It walks a single chunk, handing the unread tail of the chunk to the decoder
 // on each call to Next.
 type utf8Stream struct {
 	// d is the decoder that holds the partial-rune buffer.
 	d    *UTF8ChunkDecoder
 	// data is the chunk being decoded.
 	data []byte
 	// pos is the number of bytes of data consumed so far.
 	pos  int
 }

 // Compile-time check that *utf8Stream satisfies RuneStreamDecoder.
 var _ RuneStreamDecoder = (*utf8Stream)(nil)

 // Next returns the next rune of the chunk, or EOF once every byte of the chunk
 // has been consumed.  EOF is also returned when the remaining bytes were
 // buffered by the decoder as a partial rune.
 func (s *utf8Stream) Next() rune {
 	remaining := s.data[s.pos:]
 	if len(remaining) == 0 {
 		return EOF
 	}
 	r, consumed := s.d.nextRune(remaining)
 	s.pos += consumed
 	return r
 }

 // BytePos returns the number of bytes of the chunk that Next has consumed so
 // far, counting bytes that were buffered as a partial rune.
 func (s *utf8Stream) BytePos() int {
 	return s.pos
 }

 // utf8LeftoverStream implements UTF8ChunkDecoder.DecodeLeftover.
 //
 // It drains the decoder's partial-rune buffer; it has no chunk of its own.
 type utf8LeftoverStream struct {
 	// d is the decoder whose buffered bytes are being drained.
 	d   *UTF8ChunkDecoder
 	// pos is the number of buffered bytes decoded so far by this stream.
 	pos int
 }

 // Compile-time check that *utf8LeftoverStream satisfies RuneStreamDecoder.
 var _ RuneStreamDecoder = (*utf8LeftoverStream)(nil)

 // Next decodes one rune from the front of the decoder's partial-rune buffer
 // and removes the decoded bytes from it.  It returns EOF once the buffer is
 // empty.
 func (s *utf8LeftoverStream) Next() rune {
 	dec := s.d
 	if dec.partialLen == 0 {
 		return EOF
 	}
 	r, width := utf8.DecodeRune(dec.partial[:dec.partialLen])
 	// Slide the undecoded bytes to the front of the buffer.
 	copy(dec.partial[:], dec.partial[width:])
 	dec.partialLen -= width
 	s.pos += width
 	return r
 }

 // BytePos returns the number of buffered bytes that Next has decoded so far.
 func (s *utf8LeftoverStream) BytePos() int {
 	return s.pos
 }
	// Copyright 2015 The Vanadium Authors. All rights reserved.
	// Use of this source code is governed by a BSD-style
	// license that can be found in the LICENSE file.

	package textutil

	import (
	"bytes"
	"fmt"
	"unicode/utf8"
	)

	// UTF8Encoder implements RuneEncoder for the UTF-8 encoding.
	//
	// It is an empty struct, so it holds no state between calls.
	type UTF8Encoder struct{}

	// Compile-time check that UTF8Encoder satisfies RuneEncoder.
	var _ RuneEncoder = UTF8Encoder{}

	// Encode appends the UTF-8 encoding of r to buf.
	func (UTF8Encoder) Encode(r rune, buf *bytes.Buffer) {
		// The results are discarded on purpose: bytes.Buffer.WriteRune documents
		// its error result as always nil.
		_, _ = buf.WriteRune(r)
	}

	// UTF8ChunkDecoder implements RuneChunkDecoder for a stream of UTF-8 data that
	// is arbitrarily chunked.
	//
	// UTF-8 is a byte-wise encoding that may use multiple bytes to encode a single
	// rune. This decoder buffers partial runes that have been split across chunks,
	// so that a full rune is returned when the subsequent data chunk is provided.
	//
	// This is commonly used to implement an io.Writer wrapper over UTF-8 text. It
	// is useful since the data provided to Write calls may be arbitrarily chunked.
	//
	// The zero UTF8ChunkDecoder is a decoder with an empty buffer.
	type UTF8ChunkDecoder struct {
	// The only state we keep is the last partial rune we've encountered.
	//
	// partial stores the buffered bytes. It is sized to utf8.UTFMax, the
	// maximum number of bytes in a UTF-8 encoded rune.
	partial [utf8.UTFMax]byte
	// partialLen is the number of buffered bytes at the front of partial; zero
	// means nothing is buffered.
	partialLen int
	}

	// Compile-time check that *UTF8ChunkDecoder satisfies RuneChunkDecoder.
	var _ RuneChunkDecoder = (*UTF8ChunkDecoder)(nil)

	// Decode wraps chunk in a RuneStreamDecoder. Keep calling Next on the result
	// until it yields EOF; at that point the whole chunk has been consumed.
	//
	// A rune whose encoding is cut off by the end of the chunk is not returned.
	// Its leading bytes are held in the decoder and are completed by the data
	// passed to the following Decode call.
	//
	// Each byte of an invalid encoding is reported as U+FFFD, following the rules
	// of unicode/utf8.DecodeRune.
	func (d *UTF8ChunkDecoder) Decode(chunk []byte) RuneStreamDecoder {
		return &utf8Stream{
			d:    d,
			data: chunk,
			pos:  0,
		}
	}

	// DecodeLeftover exposes whatever is still buffered in the decoder as a
	// RuneStreamDecoder. Keep calling Next on the result until it yields EOF so
	// that the buffer is fully drained.
	//
	// The buffer only ever holds the bytes of an unfinished rune, so the stream
	// yields nothing but U+FFFD values followed by EOF.
	func (d *UTF8ChunkDecoder) DecodeLeftover() RuneStreamDecoder {
		return &utf8LeftoverStream{
			d:   d,
			pos: 0,
		}
	}

	// nextRune produces the next rune from the logical concatenation of the
	// buffered partial rune (if any) and data. The int result is the number of
	// bytes of data consumed by this call.
	//
	// The two results are independent: EOF may come with a size > 0 when the tail
	// of data was an incomplete rune that has now been buffered, and a real rune
	// may come with size 0 when it was decoded purely from buffered bytes.
	func (d *UTF8ChunkDecoder) nextRune(data []byte) (rune, int) {
		if d.partialLen > 0 {
			return d.nextRunePartial(data)
		}
		r, width := utf8.DecodeRune(data)
		if r != utf8.RuneError || utf8.FullRune(data) {
			// Either a rune other than U+FFFD, or a U+FFFD produced from a complete
			// (possibly invalid) encoding; both are decoded straight from data.
			return r, width
		}
		// data stops in the middle of a rune: stash what we have and wait for more.
		buffered := copy(d.partial[:], data)
		d.partialLen = buffered
		return d.verifyPartial(buffered, data)
	}

	// nextRunePartial is the nextRune path taken while bytes of an incomplete rune
	// are still buffered from earlier input.
	func (d *UTF8ChunkDecoder) nextRunePartial(data []byte) (rune, int) {
		// Top up the buffer from data; copy stops once partial is full.
		buffered := d.partialLen
		added := copy(d.partial[buffered:], data)
		d.partialLen = buffered + added
		pending := d.partial[:d.partialLen]
		if !utf8.FullRune(pending) {
			// Still incomplete, so all of data should now be in the buffer.
			return d.verifyPartial(added, data)
		}
		r, width := utf8.DecodeRune(pending)
		if width >= buffered {
			// Every previously buffered byte has been consumed. Empty the buffer
			// and report only the bytes that were taken from data, so the caller
			// resumes decoding directly from data.
			d.partialLen = 0
			return r, width - buffered
		}
		// Fewer bytes were consumed than were already buffered. This happens
		// when the buffered prefix turns out to be invalid: for example, with two
		// bytes of a 3-byte rune buffered and a third byte that is not a UTF-8
		// trailing byte, utf8.DecodeRune returns U+FFFD with a width of 1.
		//
		// Drop the consumed bytes, slide the rest of the old prefix to the front,
		// and take nothing from data. Shrinking partialLen on every such call is
		// not required for correctness, but it avoids unnecessary repeated copying
		// of data into the buffer.
		copy(d.partial[:], d.partial[width:buffered])
		d.partialLen = buffered - width
		return r, 0
	}

	// verifyPartial handles the "rune still incomplete" outcome, after ncopy bytes
	// of data have been copied into the partial rune buffer. In that state the
	// whole of data must have been buffered, so it reports EOF together with
	// len(data) as the consumed size.
	//
	// It panics if some of data was left uncopied.
	func (d *UTF8ChunkDecoder) verifyPartial(ncopy int, data []byte) (rune, int) {
		if total := len(data); ncopy >= total {
			return EOF, total
		}
		// The buffer filled up before data ran out, yet the rune is still
		// incomplete. That should be impossible, because any utf8.UTFMax bytes
		// form a full rune.
		panic(fmt.Errorf("UTF8ChunkDecoder: partial rune %v with leftover data %v", d.partial[:d.partialLen], data[ncopy:]))
	}

	// utf8Stream implements UTF8ChunkDecoder.Decode.
	//
	// It walks a single chunk, handing the unread tail of the chunk to the decoder
	// on each call to Next.
	type utf8Stream struct {
	// d is the decoder that holds the partial-rune buffer.
	d *UTF8ChunkDecoder
	// data is the chunk being decoded.
	data []byte
	// pos is the number of bytes of data consumed so far.
	pos int
	}

	// Compile-time check that *utf8Stream satisfies RuneStreamDecoder.
	var _ RuneStreamDecoder = (*utf8Stream)(nil)

	// Next returns the next rune of the chunk, or EOF once every byte of the chunk
	// has been consumed. EOF is also returned when the remaining bytes were
	// buffered by the decoder as a partial rune.
	func (s *utf8Stream) Next() rune {
		remaining := s.data[s.pos:]
		if len(remaining) == 0 {
			return EOF
		}
		r, consumed := s.d.nextRune(remaining)
		s.pos += consumed
		return r
	}

	// BytePos returns the number of bytes of the chunk that Next has consumed so
	// far, counting bytes that were buffered as a partial rune.
	func (s *utf8Stream) BytePos() int {
	return s.pos
	}

	// utf8LeftoverStream implements UTF8ChunkDecoder.DecodeLeftover.
	//
	// It drains the decoder's partial-rune buffer; it has no chunk of its own.
	type utf8LeftoverStream struct {
	// d is the decoder whose buffered bytes are being drained.
	d *UTF8ChunkDecoder
	// pos is the number of buffered bytes decoded so far by this stream.
	pos int
	}

	// Compile-time check that *utf8LeftoverStream satisfies RuneStreamDecoder.
	var _ RuneStreamDecoder = (*utf8LeftoverStream)(nil)

	// Next decodes one rune from the front of the decoder's partial-rune buffer
	// and removes the decoded bytes from it. It returns EOF once the buffer is
	// empty.
	func (s *utf8LeftoverStream) Next() rune {
		dec := s.d
		if dec.partialLen == 0 {
			return EOF
		}
		r, width := utf8.DecodeRune(dec.partial[:dec.partialLen])
		// Slide the undecoded bytes to the front of the buffer.
		copy(dec.partial[:], dec.partial[width:])
		dec.partialLen -= width
		s.pos += width
		return r
	}

	// BytePos returns the number of buffered bytes that Next has decoded so far.
	func (s *utf8LeftoverStream) BytePos() int {
	return s.pos
	}