mirror of https://github.com/gogits/gogs.git
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
258 lines
5.9 KiB
258 lines
5.9 KiB
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
|
|
|
package html |
|
|
|
import ( |
|
"bytes" |
|
"strings" |
|
"unicode/utf8" |
|
) |
|
|
|
// replacementTable permits compatibility with old numeric entities that
// assumed Windows-1252 encoding. It is indexed by the numeric reference
// minus 0x80, so it covers the references 0x80 through 0x9F inclusive.
// https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference
var replacementTable = [...]rune{
	'\u20AC', // First entry is what 0x80 should be replaced with.
	'\u0081',
	'\u201A',
	'\u0192',
	'\u201E',
	'\u2026',
	'\u2020',
	'\u2021',
	'\u02C6',
	'\u2030',
	'\u0160',
	'\u2039',
	'\u0152',
	'\u008D',
	'\u017D',
	'\u008F',
	'\u0090',
	'\u2018',
	'\u2019',
	'\u201C',
	'\u201D',
	'\u2022',
	'\u2013',
	'\u2014',
	'\u02DC',
	'\u2122',
	'\u0161',
	'\u203A',
	'\u0153',
	'\u009D',
	'\u017E',
	'\u0178', // Last entry is 0x9F.
	// 0x00->'\uFFFD' is handled programmatically.
	// 0x0D->'\u000D' is a no-op.
}
|
|
|
// unescapeEntity reads an entity like "<" from b[src:] and writes the |
|
// corresponding "<" to b[dst:], returning the incremented dst and src cursors. |
|
// Precondition: b[src] == '&' && dst <= src. |
|
// attribute should be true if parsing an attribute value. |
|
func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) { |
|
// https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference |
|
|
|
// i starts at 1 because we already know that s[0] == '&'. |
|
i, s := 1, b[src:] |
|
|
|
if len(s) <= 1 { |
|
b[dst] = b[src] |
|
return dst + 1, src + 1 |
|
} |
|
|
|
if s[i] == '#' { |
|
if len(s) <= 3 { // We need to have at least "&#.". |
|
b[dst] = b[src] |
|
return dst + 1, src + 1 |
|
} |
|
i++ |
|
c := s[i] |
|
hex := false |
|
if c == 'x' || c == 'X' { |
|
hex = true |
|
i++ |
|
} |
|
|
|
x := '\x00' |
|
for i < len(s) { |
|
c = s[i] |
|
i++ |
|
if hex { |
|
if '0' <= c && c <= '9' { |
|
x = 16*x + rune(c) - '0' |
|
continue |
|
} else if 'a' <= c && c <= 'f' { |
|
x = 16*x + rune(c) - 'a' + 10 |
|
continue |
|
} else if 'A' <= c && c <= 'F' { |
|
x = 16*x + rune(c) - 'A' + 10 |
|
continue |
|
} |
|
} else if '0' <= c && c <= '9' { |
|
x = 10*x + rune(c) - '0' |
|
continue |
|
} |
|
if c != ';' { |
|
i-- |
|
} |
|
break |
|
} |
|
|
|
if i <= 3 { // No characters matched. |
|
b[dst] = b[src] |
|
return dst + 1, src + 1 |
|
} |
|
|
|
if 0x80 <= x && x <= 0x9F { |
|
// Replace characters from Windows-1252 with UTF-8 equivalents. |
|
x = replacementTable[x-0x80] |
|
} else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF { |
|
// Replace invalid characters with the replacement character. |
|
x = '\uFFFD' |
|
} |
|
|
|
return dst + utf8.EncodeRune(b[dst:], x), src + i |
|
} |
|
|
|
// Consume the maximum number of characters possible, with the |
|
// consumed characters matching one of the named references. |
|
|
|
for i < len(s) { |
|
c := s[i] |
|
i++ |
|
// Lower-cased characters are more common in entities, so we check for them first. |
|
if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' { |
|
continue |
|
} |
|
if c != ';' { |
|
i-- |
|
} |
|
break |
|
} |
|
|
|
entityName := string(s[1:i]) |
|
if entityName == "" { |
|
// No-op. |
|
} else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' { |
|
// No-op. |
|
} else if x := entity[entityName]; x != 0 { |
|
return dst + utf8.EncodeRune(b[dst:], x), src + i |
|
} else if x := entity2[entityName]; x[0] != 0 { |
|
dst1 := dst + utf8.EncodeRune(b[dst:], x[0]) |
|
return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i |
|
} else if !attribute { |
|
maxLen := len(entityName) - 1 |
|
if maxLen > longestEntityWithoutSemicolon { |
|
maxLen = longestEntityWithoutSemicolon |
|
} |
|
for j := maxLen; j > 1; j-- { |
|
if x := entity[entityName[:j]]; x != 0 { |
|
return dst + utf8.EncodeRune(b[dst:], x), src + j + 1 |
|
} |
|
} |
|
} |
|
|
|
dst1, src1 = dst+i, src+i |
|
copy(b[dst:dst1], b[src:src1]) |
|
return dst1, src1 |
|
} |
|
|
|
// unescape unescapes b's entities in-place, so that "a<b" becomes "a<b". |
|
// attribute should be true if parsing an attribute value. |
|
func unescape(b []byte, attribute bool) []byte { |
|
for i, c := range b { |
|
if c == '&' { |
|
dst, src := unescapeEntity(b, i, i, attribute) |
|
for src < len(b) { |
|
c := b[src] |
|
if c == '&' { |
|
dst, src = unescapeEntity(b, dst, src, attribute) |
|
} else { |
|
b[dst] = c |
|
dst, src = dst+1, src+1 |
|
} |
|
} |
|
return b[0:dst] |
|
} |
|
} |
|
return b |
|
} |
|
|
|
// lower lower-cases the A-Z bytes in b in-place, so that "aBc" becomes "abc".
// All other bytes are left untouched. It returns b.
func lower(b []byte) []byte {
	for i, c := range b {
		if 'A' <= c && c <= 'Z' {
			b[i] = c + 'a' - 'A'
		}
	}
	return b
}
|
|
|
// escapedChars lists the bytes that escape replaces with character references.
const escapedChars = "&'<>\"\r"
|
|
|
func escape(w writer, s string) error { |
|
i := strings.IndexAny(s, escapedChars) |
|
for i != -1 { |
|
if _, err := w.WriteString(s[:i]); err != nil { |
|
return err |
|
} |
|
var esc string |
|
switch s[i] { |
|
case '&': |
|
esc = "&" |
|
case '\'': |
|
// "'" is shorter than "'" and apos was not in HTML until HTML5. |
|
esc = "'" |
|
case '<': |
|
esc = "<" |
|
case '>': |
|
esc = ">" |
|
case '"': |
|
// """ is shorter than """. |
|
esc = """ |
|
case '\r': |
|
esc = " " |
|
default: |
|
panic("unrecognized escape character") |
|
} |
|
s = s[i+1:] |
|
if _, err := w.WriteString(esc); err != nil { |
|
return err |
|
} |
|
i = strings.IndexAny(s, escapedChars) |
|
} |
|
_, err := w.WriteString(s) |
|
return err |
|
} |
|
|
|
// EscapeString escapes special characters like "<" to become "<". It |
|
// escapes only five such characters: <, >, &, ' and ". |
|
// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't |
|
// always true. |
|
func EscapeString(s string) string { |
|
if strings.IndexAny(s, escapedChars) == -1 { |
|
return s |
|
} |
|
var buf bytes.Buffer |
|
escape(&buf, s) |
|
return buf.String() |
|
} |
|
|
|
// UnescapeString unescapes entities like "<" to become "<". It unescapes a |
|
// larger range of entities than EscapeString escapes. For example, "á" |
|
// unescapes to "รก", as does "á" and "&xE1;". |
|
// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't |
|
// always true. |
|
func UnescapeString(s string) string { |
|
for _, c := range s { |
|
if c == '&' { |
|
return string(unescape([]byte(s), false)) |
|
} |
|
} |
|
return s |
|
}
|
|
|