mirror of https://github.com/gogits/gogs.git
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
335 lines
9.5 KiB
335 lines
9.5 KiB
// Copyright 2013 The Go Authors. All rights reserved. |
|
// Use of this source code is governed by a BSD-style |
|
// license that can be found in the LICENSE file. |
|
|
|
// Package encoding defines an interface for character encodings, such as Shift |
|
// JIS and Windows 1252, that can convert to and from UTF-8. |
|
// |
|
// Encoding implementations are provided in other packages, such as |
|
// golang.org/x/text/encoding/charmap and |
|
// golang.org/x/text/encoding/japanese. |
|
package encoding // import "golang.org/x/text/encoding" |
|
|
|
import ( |
|
"errors" |
|
"io" |
|
"strconv" |
|
"unicode/utf8" |
|
|
|
"golang.org/x/text/encoding/internal/identifier" |
|
"golang.org/x/text/transform" |
|
) |
|
|
|
// TODO: |
|
// - There seems to be some inconsistency in when decoders return errors |
|
// and when not. Also documentation seems to suggest they shouldn't return |
|
// errors at all (except for UTF-16). |
|
// - Encoders seem to rely on or at least benefit from the input being in NFC |
|
// normal form. Perhaps add an example how users could prepare their output. |
|
|
|
// Encoding is a character set encoding that can be transformed to and from |
|
// UTF-8. |
|
type Encoding interface { |
|
// NewDecoder returns a Decoder. |
|
NewDecoder() *Decoder |
|
|
|
// NewEncoder returns an Encoder. |
|
NewEncoder() *Encoder |
|
} |
|
|
|
// A Decoder converts bytes to UTF-8. It implements transform.Transformer. |
|
// |
|
// Transforming source bytes that are not of that encoding will not result in an |
|
// error per se. Each byte that cannot be transcoded will be represented in the |
|
// output by the UTF-8 encoding of '\uFFFD', the replacement rune. |
|
type Decoder struct { |
|
transform.Transformer |
|
|
|
// This forces external creators of Decoders to use names in struct |
|
// initializers, allowing for future extendibility without having to break |
|
// code. |
|
_ struct{} |
|
} |
|
|
|
// Bytes converts the given encoded bytes to UTF-8. It returns the converted |
|
// bytes or nil, err if any error occurred. |
|
func (d *Decoder) Bytes(b []byte) ([]byte, error) { |
|
b, _, err := transform.Bytes(d, b) |
|
if err != nil { |
|
return nil, err |
|
} |
|
return b, nil |
|
} |
|
|
|
// String converts the given encoded string to UTF-8. It returns the converted |
|
// string or "", err if any error occurred. |
|
func (d *Decoder) String(s string) (string, error) { |
|
s, _, err := transform.String(d, s) |
|
if err != nil { |
|
return "", err |
|
} |
|
return s, nil |
|
} |
|
|
|
// Reader wraps another Reader to decode its bytes. |
|
// |
|
// The Decoder may not be used for any other operation as long as the returned |
|
// Reader is in use. |
|
func (d *Decoder) Reader(r io.Reader) io.Reader { |
|
return transform.NewReader(r, d) |
|
} |
|
|
|
// An Encoder converts bytes from UTF-8. It implements transform.Transformer. |
|
// |
|
// Each rune that cannot be transcoded will result in an error. In this case, |
|
// the transform will consume all source byte up to, not including the offending |
|
// rune. Transforming source bytes that are not valid UTF-8 will be replaced by |
|
// `\uFFFD`. To return early with an error instead, use transform.Chain to |
|
// preprocess the data with a UTF8Validator. |
|
type Encoder struct { |
|
transform.Transformer |
|
|
|
// This forces external creators of Encoders to use names in struct |
|
// initializers, allowing for future extendibility without having to break |
|
// code. |
|
_ struct{} |
|
} |
|
|
|
// Bytes converts bytes from UTF-8. It returns the converted bytes or nil, err if |
|
// any error occurred. |
|
func (e *Encoder) Bytes(b []byte) ([]byte, error) { |
|
b, _, err := transform.Bytes(e, b) |
|
if err != nil { |
|
return nil, err |
|
} |
|
return b, nil |
|
} |
|
|
|
// String converts a string from UTF-8. It returns the converted string or |
|
// "", err if any error occurred. |
|
func (e *Encoder) String(s string) (string, error) { |
|
s, _, err := transform.String(e, s) |
|
if err != nil { |
|
return "", err |
|
} |
|
return s, nil |
|
} |
|
|
|
// Writer wraps another Writer to encode its UTF-8 output. |
|
// |
|
// The Encoder may not be used for any other operation as long as the returned |
|
// Writer is in use. |
|
func (e *Encoder) Writer(w io.Writer) io.Writer { |
|
return transform.NewWriter(w, e) |
|
} |
|
|
|
// ASCIISub is the ASCII substitute character (0x1A), following the
// recommendation in http://unicode.org/reports/tr36/#Text_Comparison
const ASCIISub = '\x1a'
|
|
|
// Nop is the nop encoding. Its transformed bytes are the same as the source |
|
// bytes; it does not replace invalid UTF-8 sequences. |
|
var Nop Encoding = nop{} |
|
|
|
type nop struct{} |
|
|
|
func (nop) NewDecoder() *Decoder { |
|
return &Decoder{Transformer: transform.Nop} |
|
} |
|
func (nop) NewEncoder() *Encoder { |
|
return &Encoder{Transformer: transform.Nop} |
|
} |
|
|
|
// Replacement is the replacement encoding. Decoding from the replacement |
|
// encoding yields a single '\uFFFD' replacement rune. Encoding from UTF-8 to |
|
// the replacement encoding yields the same as the source bytes except that |
|
// invalid UTF-8 is converted to '\uFFFD'. |
|
// |
|
// It is defined at http://encoding.spec.whatwg.org/#replacement |
|
var Replacement Encoding = replacement{} |
|
|
|
type replacement struct{} |
|
|
|
func (replacement) NewDecoder() *Decoder { |
|
return &Decoder{Transformer: replacementDecoder{}} |
|
} |
|
|
|
func (replacement) NewEncoder() *Encoder { |
|
return &Encoder{Transformer: replacementEncoder{}} |
|
} |
|
|
|
func (replacement) ID() (mib identifier.MIB, other string) { |
|
return identifier.Replacement, "" |
|
} |
|
|
|
type replacementDecoder struct{ transform.NopResetter } |
|
|
|
func (replacementDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
|
if len(dst) < 3 { |
|
return 0, 0, transform.ErrShortDst |
|
} |
|
if atEOF { |
|
const fffd = "\ufffd" |
|
dst[0] = fffd[0] |
|
dst[1] = fffd[1] |
|
dst[2] = fffd[2] |
|
nDst = 3 |
|
} |
|
return nDst, len(src), nil |
|
} |
|
|
|
type replacementEncoder struct{ transform.NopResetter } |
|
|
|
func (replacementEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
|
r, size := rune(0), 0 |
|
|
|
for ; nSrc < len(src); nSrc += size { |
|
r = rune(src[nSrc]) |
|
|
|
// Decode a 1-byte rune. |
|
if r < utf8.RuneSelf { |
|
size = 1 |
|
|
|
} else { |
|
// Decode a multi-byte rune. |
|
r, size = utf8.DecodeRune(src[nSrc:]) |
|
if size == 1 { |
|
// All valid runes of size 1 (those below utf8.RuneSelf) were |
|
// handled above. We have invalid UTF-8 or we haven't seen the |
|
// full character yet. |
|
if !atEOF && !utf8.FullRune(src[nSrc:]) { |
|
err = transform.ErrShortSrc |
|
break |
|
} |
|
r = '\ufffd' |
|
} |
|
} |
|
|
|
if nDst+utf8.RuneLen(r) > len(dst) { |
|
err = transform.ErrShortDst |
|
break |
|
} |
|
nDst += utf8.EncodeRune(dst[nDst:], r) |
|
} |
|
return nDst, nSrc, err |
|
} |
|
|
|
// HTMLEscapeUnsupported wraps encoders to replace source runes outside the |
|
// repertoire of the destination encoding with HTML escape sequences. |
|
// |
|
// This wrapper exists to comply to URL and HTML forms requiring a |
|
// non-terminating legacy encoder. The produced sequences may lead to data |
|
// loss as they are indistinguishable from legitimate input. To avoid this |
|
// issue, use UTF-8 encodings whenever possible. |
|
func HTMLEscapeUnsupported(e *Encoder) *Encoder { |
|
return &Encoder{Transformer: &errorHandler{e, errorToHTML}} |
|
} |
|
|
|
// ReplaceUnsupported wraps encoders to replace source runes outside the |
|
// repertoire of the destination encoding with an encoding-specific |
|
// replacement. |
|
// |
|
// This wrapper is only provided for backwards compatibility and legacy |
|
// handling. Its use is strongly discouraged. Use UTF-8 whenever possible. |
|
func ReplaceUnsupported(e *Encoder) *Encoder { |
|
return &Encoder{Transformer: &errorHandler{e, errorToReplacement}} |
|
} |
|
|
|
type errorHandler struct { |
|
*Encoder |
|
handler func(dst []byte, r rune, err repertoireError) (n int, ok bool) |
|
} |
|
|
|
// repertoireError is the error type the wrapped encoder is expected to
// return for a rune it cannot represent. Replacement supplies the single
// byte to substitute for such a rune.
//
// TODO: consider making this error public in some form.
type repertoireError interface {
	Replacement() byte
}
|
|
|
func (h errorHandler) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
|
nDst, nSrc, err = h.Transformer.Transform(dst, src, atEOF) |
|
for err != nil { |
|
rerr, ok := err.(repertoireError) |
|
if !ok { |
|
return nDst, nSrc, err |
|
} |
|
r, sz := utf8.DecodeRune(src[nSrc:]) |
|
n, ok := h.handler(dst[nDst:], r, rerr) |
|
if !ok { |
|
return nDst, nSrc, transform.ErrShortDst |
|
} |
|
err = nil |
|
nDst += n |
|
if nSrc += sz; nSrc < len(src) { |
|
var dn, sn int |
|
dn, sn, err = h.Transformer.Transform(dst[nDst:], src[nSrc:], atEOF) |
|
nDst += dn |
|
nSrc += sn |
|
} |
|
} |
|
return nDst, nSrc, err |
|
} |
|
|
|
func errorToHTML(dst []byte, r rune, err repertoireError) (n int, ok bool) { |
|
buf := [8]byte{} |
|
b := strconv.AppendUint(buf[:0], uint64(r), 10) |
|
if n = len(b) + len("&#;"); n >= len(dst) { |
|
return 0, false |
|
} |
|
dst[0] = '&' |
|
dst[1] = '#' |
|
dst[copy(dst[2:], b)+2] = ';' |
|
return n, true |
|
} |
|
|
|
func errorToReplacement(dst []byte, r rune, err repertoireError) (n int, ok bool) { |
|
if len(dst) == 0 { |
|
return 0, false |
|
} |
|
dst[0] = err.Replacement() |
|
return 1, true |
|
} |
|
|
|
// ErrInvalidUTF8 is the error returned by a transformer that encountered
// invalid UTF-8.
var ErrInvalidUTF8 = errors.New("encoding: invalid UTF-8")
|
|
|
// UTF8Validator is a transformer that returns ErrInvalidUTF8 on the first |
|
// input byte that is not valid UTF-8. |
|
var UTF8Validator transform.Transformer = utf8Validator{} |
|
|
|
type utf8Validator struct{ transform.NopResetter } |
|
|
|
func (utf8Validator) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
|
n := len(src) |
|
if n > len(dst) { |
|
n = len(dst) |
|
} |
|
for i := 0; i < n; { |
|
if c := src[i]; c < utf8.RuneSelf { |
|
dst[i] = c |
|
i++ |
|
continue |
|
} |
|
_, size := utf8.DecodeRune(src[i:]) |
|
if size == 1 { |
|
// All valid runes of size 1 (those below utf8.RuneSelf) were |
|
// handled above. We have invalid UTF-8 or we haven't seen the |
|
// full character yet. |
|
err = ErrInvalidUTF8 |
|
if !atEOF && !utf8.FullRune(src[i:]) { |
|
err = transform.ErrShortSrc |
|
} |
|
return i, i, err |
|
} |
|
if i+size > len(dst) { |
|
return i, i, transform.ErrShortDst |
|
} |
|
for ; size > 0; size-- { |
|
dst[i] = src[i] |
|
i++ |
|
} |
|
} |
|
if len(src) > len(dst) { |
|
err = transform.ErrShortDst |
|
} |
|
return n, n, err |
|
}
|
|
|