mirror of https://github.com/gogits/gogs.git
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
2094 lines
48 KiB
2094 lines
48 KiB
// Copyright 2010 The Go Authors. All rights reserved. |
|
// Use of this source code is governed by a BSD-style |
|
// license that can be found in the LICENSE file. |
|
|
|
package html |
|
|
|
import ( |
|
"errors" |
|
"fmt" |
|
"io" |
|
"strings" |
|
|
|
a "golang.org/x/net/html/atom" |
|
) |
|
|
|
// A parser implements the HTML5 parsing algorithm: |
|
// https://html.spec.whatwg.org/multipage/syntax.html#tree-construction |
|
type parser struct { |
|
// tokenizer provides the tokens for the parser. |
|
tokenizer *Tokenizer |
|
// tok is the most recently read token. |
|
tok Token |
|
// Self-closing tags like <hr/> are treated as start tags, except that |
|
// hasSelfClosingToken is set while they are being processed. |
|
hasSelfClosingToken bool |
|
// doc is the document root element. |
|
doc *Node |
|
// The stack of open elements (section 12.2.3.2) and active formatting |
|
// elements (section 12.2.3.3). |
|
oe, afe nodeStack |
|
// Element pointers (section 12.2.3.4). |
|
head, form *Node |
|
// Other parsing state flags (section 12.2.3.5). |
|
scripting, framesetOK bool |
|
// im is the current insertion mode. |
|
im insertionMode |
|
// originalIM is the insertion mode to go back to after completing a text |
|
// or inTableText insertion mode. |
|
originalIM insertionMode |
|
// fosterParenting is whether new elements should be inserted according to |
|
// the foster parenting rules (section 12.2.5.3). |
|
fosterParenting bool |
|
// quirks is whether the parser is operating in "quirks mode." |
|
quirks bool |
|
// fragment is whether the parser is parsing an HTML fragment. |
|
fragment bool |
|
// context is the context element when parsing an HTML fragment |
|
// (section 12.4). |
|
context *Node |
|
} |
|
|
|
func (p *parser) top() *Node { |
|
if n := p.oe.top(); n != nil { |
|
return n |
|
} |
|
return p.doc |
|
} |
|
|
|
// Stop tags for use in popUntil. These come from section 12.2.3.2. |
|
var ( |
|
defaultScopeStopTags = map[string][]a.Atom{ |
|
"": {a.Applet, a.Caption, a.Html, a.Table, a.Td, a.Th, a.Marquee, a.Object, a.Template}, |
|
"math": {a.AnnotationXml, a.Mi, a.Mn, a.Mo, a.Ms, a.Mtext}, |
|
"svg": {a.Desc, a.ForeignObject, a.Title}, |
|
} |
|
) |
|
|
|
// scope selects which set of stop tags bounds a search of the stack of open
// elements.
type scope int

const (
	// defaultScope stops only at the tags in defaultScopeStopTags.
	defaultScope = scope(iota)
	// listItemScope additionally stops at <ol> and <ul>.
	listItemScope
	// buttonScope additionally stops at <button>.
	buttonScope
	// tableScope stops at <html> and <table>.
	tableScope
	// tableRowScope is accepted by clearStackToContext, where it stops at
	// <html> and <tr>.
	tableRowScope
	// tableBodyScope is accepted by clearStackToContext, where it stops at
	// <html>, <tbody>, <tfoot> and <thead>.
	tableBodyScope
	// selectScope stops at every HTML element other than <optgroup> and
	// <option>.
	selectScope
)
|
|
|
// popUntil pops the stack of open elements at the highest element whose tag |
|
// is in matchTags, provided there is no higher element in the scope's stop |
|
// tags (as defined in section 12.2.3.2). It returns whether or not there was |
|
// such an element. If there was not, popUntil leaves the stack unchanged. |
|
// |
|
// For example, the set of stop tags for table scope is: "html", "table". If |
|
// the stack was: |
|
// ["html", "body", "font", "table", "b", "i", "u"] |
|
// then popUntil(tableScope, "font") would return false, but |
|
// popUntil(tableScope, "i") would return true and the stack would become: |
|
// ["html", "body", "font", "table", "b"] |
|
// |
|
// If an element's tag is in both the stop tags and matchTags, then the stack |
|
// will be popped and the function returns true (provided, of course, there was |
|
// no higher element in the stack that was also in the stop tags). For example, |
|
// popUntil(tableScope, "table") returns true and leaves: |
|
// ["html", "body", "font"] |
|
func (p *parser) popUntil(s scope, matchTags ...a.Atom) bool { |
|
if i := p.indexOfElementInScope(s, matchTags...); i != -1 { |
|
p.oe = p.oe[:i] |
|
return true |
|
} |
|
return false |
|
} |
|
|
|
// indexOfElementInScope returns the index in p.oe of the highest element whose |
|
// tag is in matchTags that is in scope. If no matching element is in scope, it |
|
// returns -1. |
|
func (p *parser) indexOfElementInScope(s scope, matchTags ...a.Atom) int { |
|
for i := len(p.oe) - 1; i >= 0; i-- { |
|
tagAtom := p.oe[i].DataAtom |
|
if p.oe[i].Namespace == "" { |
|
for _, t := range matchTags { |
|
if t == tagAtom { |
|
return i |
|
} |
|
} |
|
switch s { |
|
case defaultScope: |
|
// No-op. |
|
case listItemScope: |
|
if tagAtom == a.Ol || tagAtom == a.Ul { |
|
return -1 |
|
} |
|
case buttonScope: |
|
if tagAtom == a.Button { |
|
return -1 |
|
} |
|
case tableScope: |
|
if tagAtom == a.Html || tagAtom == a.Table { |
|
return -1 |
|
} |
|
case selectScope: |
|
if tagAtom != a.Optgroup && tagAtom != a.Option { |
|
return -1 |
|
} |
|
default: |
|
panic("unreachable") |
|
} |
|
} |
|
switch s { |
|
case defaultScope, listItemScope, buttonScope: |
|
for _, t := range defaultScopeStopTags[p.oe[i].Namespace] { |
|
if t == tagAtom { |
|
return -1 |
|
} |
|
} |
|
} |
|
} |
|
return -1 |
|
} |
|
|
|
// elementInScope is like popUntil, except that it doesn't modify the stack of |
|
// open elements. |
|
func (p *parser) elementInScope(s scope, matchTags ...a.Atom) bool { |
|
return p.indexOfElementInScope(s, matchTags...) != -1 |
|
} |
|
|
|
// clearStackToContext pops elements off the stack of open elements until a |
|
// scope-defined element is found. |
|
func (p *parser) clearStackToContext(s scope) { |
|
for i := len(p.oe) - 1; i >= 0; i-- { |
|
tagAtom := p.oe[i].DataAtom |
|
switch s { |
|
case tableScope: |
|
if tagAtom == a.Html || tagAtom == a.Table { |
|
p.oe = p.oe[:i+1] |
|
return |
|
} |
|
case tableRowScope: |
|
if tagAtom == a.Html || tagAtom == a.Tr { |
|
p.oe = p.oe[:i+1] |
|
return |
|
} |
|
case tableBodyScope: |
|
if tagAtom == a.Html || tagAtom == a.Tbody || tagAtom == a.Tfoot || tagAtom == a.Thead { |
|
p.oe = p.oe[:i+1] |
|
return |
|
} |
|
default: |
|
panic("unreachable") |
|
} |
|
} |
|
} |
|
|
|
// generateImpliedEndTags pops nodes off the stack of open elements as long as |
|
// the top node has a tag name of dd, dt, li, option, optgroup, p, rp, or rt. |
|
// If exceptions are specified, nodes with that name will not be popped off. |
|
func (p *parser) generateImpliedEndTags(exceptions ...string) { |
|
var i int |
|
loop: |
|
for i = len(p.oe) - 1; i >= 0; i-- { |
|
n := p.oe[i] |
|
if n.Type == ElementNode { |
|
switch n.DataAtom { |
|
case a.Dd, a.Dt, a.Li, a.Option, a.Optgroup, a.P, a.Rp, a.Rt: |
|
for _, except := range exceptions { |
|
if n.Data == except { |
|
break loop |
|
} |
|
} |
|
continue |
|
} |
|
} |
|
break |
|
} |
|
|
|
p.oe = p.oe[:i+1] |
|
} |
|
|
|
// addChild adds a child node n to the top element, and pushes n onto the stack |
|
// of open elements if it is an element node. |
|
func (p *parser) addChild(n *Node) { |
|
if p.shouldFosterParent() { |
|
p.fosterParent(n) |
|
} else { |
|
p.top().AppendChild(n) |
|
} |
|
|
|
if n.Type == ElementNode { |
|
p.oe = append(p.oe, n) |
|
} |
|
} |
|
|
|
// shouldFosterParent returns whether the next node to be added should be |
|
// foster parented. |
|
func (p *parser) shouldFosterParent() bool { |
|
if p.fosterParenting { |
|
switch p.top().DataAtom { |
|
case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: |
|
return true |
|
} |
|
} |
|
return false |
|
} |
|
|
|
// fosterParent adds a child node according to the foster parenting rules. |
|
// Section 12.2.5.3, "foster parenting". |
|
func (p *parser) fosterParent(n *Node) { |
|
var table, parent, prev *Node |
|
var i int |
|
for i = len(p.oe) - 1; i >= 0; i-- { |
|
if p.oe[i].DataAtom == a.Table { |
|
table = p.oe[i] |
|
break |
|
} |
|
} |
|
|
|
if table == nil { |
|
// The foster parent is the html element. |
|
parent = p.oe[0] |
|
} else { |
|
parent = table.Parent |
|
} |
|
if parent == nil { |
|
parent = p.oe[i-1] |
|
} |
|
|
|
if table != nil { |
|
prev = table.PrevSibling |
|
} else { |
|
prev = parent.LastChild |
|
} |
|
if prev != nil && prev.Type == TextNode && n.Type == TextNode { |
|
prev.Data += n.Data |
|
return |
|
} |
|
|
|
parent.InsertBefore(n, table) |
|
} |
|
|
|
// addText adds text to the preceding node if it is a text node, or else it |
|
// calls addChild with a new text node. |
|
func (p *parser) addText(text string) { |
|
if text == "" { |
|
return |
|
} |
|
|
|
if p.shouldFosterParent() { |
|
p.fosterParent(&Node{ |
|
Type: TextNode, |
|
Data: text, |
|
}) |
|
return |
|
} |
|
|
|
t := p.top() |
|
if n := t.LastChild; n != nil && n.Type == TextNode { |
|
n.Data += text |
|
return |
|
} |
|
p.addChild(&Node{ |
|
Type: TextNode, |
|
Data: text, |
|
}) |
|
} |
|
|
|
// addElement adds a child element based on the current token. |
|
func (p *parser) addElement() { |
|
p.addChild(&Node{ |
|
Type: ElementNode, |
|
DataAtom: p.tok.DataAtom, |
|
Data: p.tok.Data, |
|
Attr: p.tok.Attr, |
|
}) |
|
} |
|
|
|
// Section 12.2.3.3. |
|
func (p *parser) addFormattingElement() { |
|
tagAtom, attr := p.tok.DataAtom, p.tok.Attr |
|
p.addElement() |
|
|
|
// Implement the Noah's Ark clause, but with three per family instead of two. |
|
identicalElements := 0 |
|
findIdenticalElements: |
|
for i := len(p.afe) - 1; i >= 0; i-- { |
|
n := p.afe[i] |
|
if n.Type == scopeMarkerNode { |
|
break |
|
} |
|
if n.Type != ElementNode { |
|
continue |
|
} |
|
if n.Namespace != "" { |
|
continue |
|
} |
|
if n.DataAtom != tagAtom { |
|
continue |
|
} |
|
if len(n.Attr) != len(attr) { |
|
continue |
|
} |
|
compareAttributes: |
|
for _, t0 := range n.Attr { |
|
for _, t1 := range attr { |
|
if t0.Key == t1.Key && t0.Namespace == t1.Namespace && t0.Val == t1.Val { |
|
// Found a match for this attribute, continue with the next attribute. |
|
continue compareAttributes |
|
} |
|
} |
|
// If we get here, there is no attribute that matches a. |
|
// Therefore the element is not identical to the new one. |
|
continue findIdenticalElements |
|
} |
|
|
|
identicalElements++ |
|
if identicalElements >= 3 { |
|
p.afe.remove(n) |
|
} |
|
} |
|
|
|
p.afe = append(p.afe, p.top()) |
|
} |
|
|
|
// Section 12.2.3.3. |
|
func (p *parser) clearActiveFormattingElements() { |
|
for { |
|
n := p.afe.pop() |
|
if len(p.afe) == 0 || n.Type == scopeMarkerNode { |
|
return |
|
} |
|
} |
|
} |
|
|
|
// Section 12.2.3.3. |
|
func (p *parser) reconstructActiveFormattingElements() { |
|
n := p.afe.top() |
|
if n == nil { |
|
return |
|
} |
|
if n.Type == scopeMarkerNode || p.oe.index(n) != -1 { |
|
return |
|
} |
|
i := len(p.afe) - 1 |
|
for n.Type != scopeMarkerNode && p.oe.index(n) == -1 { |
|
if i == 0 { |
|
i = -1 |
|
break |
|
} |
|
i-- |
|
n = p.afe[i] |
|
} |
|
for { |
|
i++ |
|
clone := p.afe[i].clone() |
|
p.addChild(clone) |
|
p.afe[i] = clone |
|
if i == len(p.afe)-1 { |
|
break |
|
} |
|
} |
|
} |
|
|
|
// acknowledgeSelfClosingTag clears hasSelfClosingToken, recording that the
// self-closing flag of the token being processed has been dealt with.
// Section 12.2.4.
func (p *parser) acknowledgeSelfClosingTag() {
	p.hasSelfClosingToken = false
}
|
|
|
// An insertion mode (section 12.2.3.1) is the state transition function from
// a particular state in the HTML5 parser's state machine. It updates the
// parser's fields depending on parser.tok (where ErrorToken means EOF).
// It returns whether the token was consumed; a false result means the token
// still has to be handled, typically by the mode the function switched to.
type insertionMode func(*parser) bool
|
|
|
// setOriginalIM sets the insertion mode to return to after completing a text or |
|
// inTableText insertion mode. |
|
// Section 12.2.3.1, "using the rules for". |
|
func (p *parser) setOriginalIM() { |
|
if p.originalIM != nil { |
|
panic("html: bad parser state: originalIM was set twice") |
|
} |
|
p.originalIM = p.im |
|
} |
|
|
|
// Section 12.2.3.1, "reset the insertion mode". |
|
func (p *parser) resetInsertionMode() { |
|
for i := len(p.oe) - 1; i >= 0; i-- { |
|
n := p.oe[i] |
|
if i == 0 && p.context != nil { |
|
n = p.context |
|
} |
|
|
|
switch n.DataAtom { |
|
case a.Select: |
|
p.im = inSelectIM |
|
case a.Td, a.Th: |
|
p.im = inCellIM |
|
case a.Tr: |
|
p.im = inRowIM |
|
case a.Tbody, a.Thead, a.Tfoot: |
|
p.im = inTableBodyIM |
|
case a.Caption: |
|
p.im = inCaptionIM |
|
case a.Colgroup: |
|
p.im = inColumnGroupIM |
|
case a.Table: |
|
p.im = inTableIM |
|
case a.Head: |
|
p.im = inBodyIM |
|
case a.Body: |
|
p.im = inBodyIM |
|
case a.Frameset: |
|
p.im = inFramesetIM |
|
case a.Html: |
|
p.im = beforeHeadIM |
|
default: |
|
continue |
|
} |
|
return |
|
} |
|
p.im = inBodyIM |
|
} |
|
|
|
// whitespace is the cutset of characters (space, tab, carriage return, line
// feed, form feed) that the insertion modes trim from text tokens.
const whitespace = " \t\r\n\f"
|
|
|
// Section 12.2.5.4.1. |
|
func initialIM(p *parser) bool { |
|
switch p.tok.Type { |
|
case TextToken: |
|
p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace) |
|
if len(p.tok.Data) == 0 { |
|
// It was all whitespace, so ignore it. |
|
return true |
|
} |
|
case CommentToken: |
|
p.doc.AppendChild(&Node{ |
|
Type: CommentNode, |
|
Data: p.tok.Data, |
|
}) |
|
return true |
|
case DoctypeToken: |
|
n, quirks := parseDoctype(p.tok.Data) |
|
p.doc.AppendChild(n) |
|
p.quirks = quirks |
|
p.im = beforeHTMLIM |
|
return true |
|
} |
|
p.quirks = true |
|
p.im = beforeHTMLIM |
|
return false |
|
} |
|
|
|
// Section 12.2.5.4.2. |
|
func beforeHTMLIM(p *parser) bool { |
|
switch p.tok.Type { |
|
case DoctypeToken: |
|
// Ignore the token. |
|
return true |
|
case TextToken: |
|
p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace) |
|
if len(p.tok.Data) == 0 { |
|
// It was all whitespace, so ignore it. |
|
return true |
|
} |
|
case StartTagToken: |
|
if p.tok.DataAtom == a.Html { |
|
p.addElement() |
|
p.im = beforeHeadIM |
|
return true |
|
} |
|
case EndTagToken: |
|
switch p.tok.DataAtom { |
|
case a.Head, a.Body, a.Html, a.Br: |
|
p.parseImpliedToken(StartTagToken, a.Html, a.Html.String()) |
|
return false |
|
default: |
|
// Ignore the token. |
|
return true |
|
} |
|
case CommentToken: |
|
p.doc.AppendChild(&Node{ |
|
Type: CommentNode, |
|
Data: p.tok.Data, |
|
}) |
|
return true |
|
} |
|
p.parseImpliedToken(StartTagToken, a.Html, a.Html.String()) |
|
return false |
|
} |
|
|
|
// Section 12.2.5.4.3. |
|
func beforeHeadIM(p *parser) bool { |
|
switch p.tok.Type { |
|
case TextToken: |
|
p.tok.Data = strings.TrimLeft(p.tok.Data, whitespace) |
|
if len(p.tok.Data) == 0 { |
|
// It was all whitespace, so ignore it. |
|
return true |
|
} |
|
case StartTagToken: |
|
switch p.tok.DataAtom { |
|
case a.Head: |
|
p.addElement() |
|
p.head = p.top() |
|
p.im = inHeadIM |
|
return true |
|
case a.Html: |
|
return inBodyIM(p) |
|
} |
|
case EndTagToken: |
|
switch p.tok.DataAtom { |
|
case a.Head, a.Body, a.Html, a.Br: |
|
p.parseImpliedToken(StartTagToken, a.Head, a.Head.String()) |
|
return false |
|
default: |
|
// Ignore the token. |
|
return true |
|
} |
|
case CommentToken: |
|
p.addChild(&Node{ |
|
Type: CommentNode, |
|
Data: p.tok.Data, |
|
}) |
|
return true |
|
case DoctypeToken: |
|
// Ignore the token. |
|
return true |
|
} |
|
|
|
p.parseImpliedToken(StartTagToken, a.Head, a.Head.String()) |
|
return false |
|
} |
|
|
|
// Section 12.2.5.4.4. |
|
func inHeadIM(p *parser) bool { |
|
switch p.tok.Type { |
|
case TextToken: |
|
s := strings.TrimLeft(p.tok.Data, whitespace) |
|
if len(s) < len(p.tok.Data) { |
|
// Add the initial whitespace to the current node. |
|
p.addText(p.tok.Data[:len(p.tok.Data)-len(s)]) |
|
if s == "" { |
|
return true |
|
} |
|
p.tok.Data = s |
|
} |
|
case StartTagToken: |
|
switch p.tok.DataAtom { |
|
case a.Html: |
|
return inBodyIM(p) |
|
case a.Base, a.Basefont, a.Bgsound, a.Command, a.Link, a.Meta: |
|
p.addElement() |
|
p.oe.pop() |
|
p.acknowledgeSelfClosingTag() |
|
return true |
|
case a.Script, a.Title, a.Noscript, a.Noframes, a.Style: |
|
p.addElement() |
|
p.setOriginalIM() |
|
p.im = textIM |
|
return true |
|
case a.Head: |
|
// Ignore the token. |
|
return true |
|
} |
|
case EndTagToken: |
|
switch p.tok.DataAtom { |
|
case a.Head: |
|
n := p.oe.pop() |
|
if n.DataAtom != a.Head { |
|
panic("html: bad parser state: <head> element not found, in the in-head insertion mode") |
|
} |
|
p.im = afterHeadIM |
|
return true |
|
case a.Body, a.Html, a.Br: |
|
p.parseImpliedToken(EndTagToken, a.Head, a.Head.String()) |
|
return false |
|
default: |
|
// Ignore the token. |
|
return true |
|
} |
|
case CommentToken: |
|
p.addChild(&Node{ |
|
Type: CommentNode, |
|
Data: p.tok.Data, |
|
}) |
|
return true |
|
case DoctypeToken: |
|
// Ignore the token. |
|
return true |
|
} |
|
|
|
p.parseImpliedToken(EndTagToken, a.Head, a.Head.String()) |
|
return false |
|
} |
|
|
|
// Section 12.2.5.4.6. |
|
func afterHeadIM(p *parser) bool { |
|
switch p.tok.Type { |
|
case TextToken: |
|
s := strings.TrimLeft(p.tok.Data, whitespace) |
|
if len(s) < len(p.tok.Data) { |
|
// Add the initial whitespace to the current node. |
|
p.addText(p.tok.Data[:len(p.tok.Data)-len(s)]) |
|
if s == "" { |
|
return true |
|
} |
|
p.tok.Data = s |
|
} |
|
case StartTagToken: |
|
switch p.tok.DataAtom { |
|
case a.Html: |
|
return inBodyIM(p) |
|
case a.Body: |
|
p.addElement() |
|
p.framesetOK = false |
|
p.im = inBodyIM |
|
return true |
|
case a.Frameset: |
|
p.addElement() |
|
p.im = inFramesetIM |
|
return true |
|
case a.Base, a.Basefont, a.Bgsound, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Title: |
|
p.oe = append(p.oe, p.head) |
|
defer p.oe.remove(p.head) |
|
return inHeadIM(p) |
|
case a.Head: |
|
// Ignore the token. |
|
return true |
|
} |
|
case EndTagToken: |
|
switch p.tok.DataAtom { |
|
case a.Body, a.Html, a.Br: |
|
// Drop down to creating an implied <body> tag. |
|
default: |
|
// Ignore the token. |
|
return true |
|
} |
|
case CommentToken: |
|
p.addChild(&Node{ |
|
Type: CommentNode, |
|
Data: p.tok.Data, |
|
}) |
|
return true |
|
case DoctypeToken: |
|
// Ignore the token. |
|
return true |
|
} |
|
|
|
p.parseImpliedToken(StartTagToken, a.Body, a.Body.String()) |
|
p.framesetOK = true |
|
return false |
|
} |
|
|
|
// copyAttributes copies attributes of src not found on dst to dst. |
|
func copyAttributes(dst *Node, src Token) { |
|
if len(src.Attr) == 0 { |
|
return |
|
} |
|
attr := map[string]string{} |
|
for _, t := range dst.Attr { |
|
attr[t.Key] = t.Val |
|
} |
|
for _, t := range src.Attr { |
|
if _, ok := attr[t.Key]; !ok { |
|
dst.Attr = append(dst.Attr, t) |
|
attr[t.Key] = t.Val |
|
} |
|
} |
|
} |
|
|
|
// inBodyIM is the "in body" insertion mode (section 12.2.5.4.7).
//
// It dispatches on the type of the current token and, for start and end
// tags, on the tag's atom. It returns true when the token has been consumed.
// It returns false when the token was rewritten in place (<image> becomes
// <img>, </br> becomes <br>) or when an implied token was parsed first and
// the current token must be processed again.
func inBodyIM(p *parser) bool {
	switch p.tok.Type {
	case TextToken:
		d := p.tok.Data
		switch n := p.oe.top(); n.DataAtom {
		case a.Pre, a.Listing:
			if n.FirstChild == nil {
				// Ignore a newline at the start of a <pre> block.
				if d != "" && d[0] == '\r' {
					d = d[1:]
				}
				if d != "" && d[0] == '\n' {
					d = d[1:]
				}
			}
		}
		// NUL bytes are dropped from body text.
		d = strings.Replace(d, "\x00", "", -1)
		if d == "" {
			return true
		}
		p.reconstructActiveFormattingElements()
		p.addText(d)
		if p.framesetOK && strings.TrimLeft(d, whitespace) != "" {
			// There were non-whitespace characters inserted.
			p.framesetOK = false
		}
	case StartTagToken:
		switch p.tok.DataAtom {
		case a.Html:
			// A repeated <html> only contributes attributes that the root
			// element does not have yet.
			copyAttributes(p.oe[0], p.tok)
		case a.Base, a.Basefont, a.Bgsound, a.Command, a.Link, a.Meta, a.Noframes, a.Script, a.Style, a.Title:
			return inHeadIM(p)
		case a.Body:
			// A repeated <body> merges its attributes into the existing body
			// element, which is expected as the second entry of the stack.
			if len(p.oe) >= 2 {
				body := p.oe[1]
				if body.Type == ElementNode && body.DataAtom == a.Body {
					p.framesetOK = false
					copyAttributes(body, p.tok)
				}
			}
		case a.Frameset:
			if !p.framesetOK || len(p.oe) < 2 || p.oe[1].DataAtom != a.Body {
				// Ignore the token.
				return true
			}
			// Replace the body with the frameset: detach the body from the
			// tree and pop everything above the root element.
			body := p.oe[1]
			if body.Parent != nil {
				body.Parent.RemoveChild(body)
			}
			p.oe = p.oe[:1]
			p.addElement()
			p.im = inFramesetIM
			return true
		case a.Address, a.Article, a.Aside, a.Blockquote, a.Center, a.Details, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Menu, a.Nav, a.Ol, a.P, a.Section, a.Summary, a.Ul:
			// These elements close an open <p> in button scope first.
			p.popUntil(buttonScope, a.P)
			p.addElement()
		case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
			p.popUntil(buttonScope, a.P)
			// Headings do not nest: a heading on top of the stack is popped.
			switch n := p.top(); n.DataAtom {
			case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
				p.oe.pop()
			}
			p.addElement()
		case a.Pre, a.Listing:
			p.popUntil(buttonScope, a.P)
			p.addElement()
			// The newline, if any, will be dealt with by the TextToken case.
			p.framesetOK = false
		case a.Form:
			// A <form> is ignored while the form element pointer is set.
			if p.form == nil {
				p.popUntil(buttonScope, a.P)
				p.addElement()
				p.form = p.top()
			}
		case a.Li:
			p.framesetOK = false
			// Walk down the stack: an open <li> is closed (popped together
			// with everything above it); the walk stops at the first special
			// element other than address, div and p.
			for i := len(p.oe) - 1; i >= 0; i-- {
				node := p.oe[i]
				switch node.DataAtom {
				case a.Li:
					p.oe = p.oe[:i]
				case a.Address, a.Div, a.P:
					continue
				default:
					if !isSpecialElement(node) {
						continue
					}
				}
				break
			}
			p.popUntil(buttonScope, a.P)
			p.addElement()
		case a.Dd, a.Dt:
			p.framesetOK = false
			// Same walk as for <li>, closing an open <dd> or <dt>.
			for i := len(p.oe) - 1; i >= 0; i-- {
				node := p.oe[i]
				switch node.DataAtom {
				case a.Dd, a.Dt:
					p.oe = p.oe[:i]
				case a.Address, a.Div, a.P:
					continue
				default:
					if !isSpecialElement(node) {
						continue
					}
				}
				break
			}
			p.popUntil(buttonScope, a.P)
			p.addElement()
		case a.Plaintext:
			p.popUntil(buttonScope, a.P)
			p.addElement()
		case a.Button:
			// An open <button> in scope is closed before the new one opens.
			p.popUntil(defaultScope, a.Button)
			p.reconstructActiveFormattingElements()
			p.addElement()
			p.framesetOK = false
		case a.A:
			// If an <a> is still active after the last scope marker, run the
			// end-tag algorithm for it and drop it from both lists before
			// opening the new one.
			for i := len(p.afe) - 1; i >= 0 && p.afe[i].Type != scopeMarkerNode; i-- {
				if n := p.afe[i]; n.Type == ElementNode && n.DataAtom == a.A {
					p.inBodyEndTagFormatting(a.A)
					p.oe.remove(n)
					p.afe.remove(n)
					break
				}
			}
			p.reconstructActiveFormattingElements()
			p.addFormattingElement()
		case a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U:
			p.reconstructActiveFormattingElements()
			p.addFormattingElement()
		case a.Nobr:
			p.reconstructActiveFormattingElements()
			if p.elementInScope(defaultScope, a.Nobr) {
				p.inBodyEndTagFormatting(a.Nobr)
				p.reconstructActiveFormattingElements()
			}
			p.addFormattingElement()
		case a.Applet, a.Marquee, a.Object:
			p.reconstructActiveFormattingElements()
			p.addElement()
			// A scope marker is pushed onto the active formatting list.
			p.afe = append(p.afe, &scopeMarker)
			p.framesetOK = false
		case a.Table:
			// Outside quirks mode a table closes an open <p>.
			if !p.quirks {
				p.popUntil(buttonScope, a.P)
			}
			p.addElement()
			p.framesetOK = false
			p.im = inTableIM
			return true
		case a.Area, a.Br, a.Embed, a.Img, a.Input, a.Keygen, a.Wbr:
			// Void elements: added and popped again right away.
			p.reconstructActiveFormattingElements()
			p.addElement()
			p.oe.pop()
			p.acknowledgeSelfClosingTag()
			if p.tok.DataAtom == a.Input {
				for _, t := range p.tok.Attr {
					if t.Key == "type" {
						if strings.ToLower(t.Val) == "hidden" {
							// Skip setting framesetOK = false
							return true
						}
					}
				}
			}
			p.framesetOK = false
		case a.Param, a.Source, a.Track:
			p.addElement()
			p.oe.pop()
			p.acknowledgeSelfClosingTag()
		case a.Hr:
			p.popUntil(buttonScope, a.P)
			p.addElement()
			p.oe.pop()
			p.acknowledgeSelfClosingTag()
			p.framesetOK = false
		case a.Image:
			// Rewrite <image> as <img> and have the token processed again.
			p.tok.DataAtom = a.Img
			p.tok.Data = a.Img.String()
			return false
		case a.Isindex:
			if p.form != nil {
				// Ignore the token.
				return true
			}
			// <isindex> is expanded into
			// <form><hr><label>prompt<input name="isindex"></label><hr></form>.
			// The "action" attribute moves to the form, "prompt" replaces
			// the label text, "name" is dropped and all other attributes go
			// to the input.
			action := ""
			prompt := "This is a searchable index. Enter search keywords: "
			attr := []Attribute{{Key: "name", Val: "isindex"}}
			for _, t := range p.tok.Attr {
				switch t.Key {
				case "action":
					action = t.Val
				case "name":
					// Ignore the attribute.
				case "prompt":
					prompt = t.Val
				default:
					attr = append(attr, t)
				}
			}
			p.acknowledgeSelfClosingTag()
			p.popUntil(buttonScope, a.P)
			p.parseImpliedToken(StartTagToken, a.Form, a.Form.String())
			if action != "" {
				p.form.Attr = []Attribute{{Key: "action", Val: action}}
			}
			p.parseImpliedToken(StartTagToken, a.Hr, a.Hr.String())
			p.parseImpliedToken(StartTagToken, a.Label, a.Label.String())
			p.addText(prompt)
			p.addChild(&Node{
				Type:     ElementNode,
				DataAtom: a.Input,
				Data:     a.Input.String(),
				Attr:     attr,
			})
			p.oe.pop()
			p.parseImpliedToken(EndTagToken, a.Label, a.Label.String())
			p.parseImpliedToken(StartTagToken, a.Hr, a.Hr.String())
			p.parseImpliedToken(EndTagToken, a.Form, a.Form.String())
		case a.Textarea:
			// The content is read in textIM; the current mode is saved so
			// that textIM can restore it.
			p.addElement()
			p.setOriginalIM()
			p.framesetOK = false
			p.im = textIM
		case a.Xmp:
			p.popUntil(buttonScope, a.P)
			p.reconstructActiveFormattingElements()
			p.framesetOK = false
			p.addElement()
			p.setOriginalIM()
			p.im = textIM
		case a.Iframe:
			p.framesetOK = false
			p.addElement()
			p.setOriginalIM()
			p.im = textIM
		case a.Noembed, a.Noscript:
			p.addElement()
			p.setOriginalIM()
			p.im = textIM
		case a.Select:
			p.reconstructActiveFormattingElements()
			p.addElement()
			p.framesetOK = false
			p.im = inSelectIM
			return true
		case a.Optgroup, a.Option:
			// An <option> on top of the stack is closed first.
			if p.top().DataAtom == a.Option {
				p.oe.pop()
			}
			p.reconstructActiveFormattingElements()
			p.addElement()
		case a.Rp, a.Rt:
			if p.elementInScope(defaultScope, a.Ruby) {
				p.generateImpliedEndTags()
			}
			p.addElement()
		case a.Math, a.Svg:
			// Foreign content: adjust attribute names for the target
			// vocabulary and set the new element's namespace to the tag
			// name of the token.
			p.reconstructActiveFormattingElements()
			if p.tok.DataAtom == a.Math {
				adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments)
			} else {
				adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments)
			}
			adjustForeignAttributes(p.tok.Attr)
			p.addElement()
			p.top().Namespace = p.tok.Data
			if p.hasSelfClosingToken {
				p.oe.pop()
				p.acknowledgeSelfClosingTag()
			}
			return true
		case a.Caption, a.Col, a.Colgroup, a.Frame, a.Head, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr:
			// Ignore the token.
		default:
			p.reconstructActiveFormattingElements()
			p.addElement()
		}
	case EndTagToken:
		switch p.tok.DataAtom {
		case a.Body:
			if p.elementInScope(defaultScope, a.Body) {
				p.im = afterBodyIM
			}
		case a.Html:
			// </html> is handled as an implied </body> followed by a second
			// pass over the same token.
			if p.elementInScope(defaultScope, a.Body) {
				p.parseImpliedToken(EndTagToken, a.Body, a.Body.String())
				return false
			}
			return true
		case a.Address, a.Article, a.Aside, a.Blockquote, a.Button, a.Center, a.Details, a.Dir, a.Div, a.Dl, a.Fieldset, a.Figcaption, a.Figure, a.Footer, a.Header, a.Hgroup, a.Listing, a.Menu, a.Nav, a.Ol, a.Pre, a.Section, a.Summary, a.Ul:
			p.popUntil(defaultScope, p.tok.DataAtom)
		case a.Form:
			// The form pointer is cleared even when the end tag ends up
			// being ignored.
			node := p.form
			p.form = nil
			i := p.indexOfElementInScope(defaultScope, a.Form)
			if node == nil || i == -1 || p.oe[i] != node {
				// Ignore the token.
				return true
			}
			p.generateImpliedEndTags()
			p.oe.remove(node)
		case a.P:
			// A </p> without an open <p> in button scope first opens an
			// implied <p>, which is then closed.
			if !p.elementInScope(buttonScope, a.P) {
				p.parseImpliedToken(StartTagToken, a.P, a.P.String())
			}
			p.popUntil(buttonScope, a.P)
		case a.Li:
			p.popUntil(listItemScope, a.Li)
		case a.Dd, a.Dt:
			p.popUntil(defaultScope, p.tok.DataAtom)
		case a.H1, a.H2, a.H3, a.H4, a.H5, a.H6:
			// Any heading end tag closes whichever heading is open.
			p.popUntil(defaultScope, a.H1, a.H2, a.H3, a.H4, a.H5, a.H6)
		case a.A, a.B, a.Big, a.Code, a.Em, a.Font, a.I, a.Nobr, a.S, a.Small, a.Strike, a.Strong, a.Tt, a.U:
			p.inBodyEndTagFormatting(p.tok.DataAtom)
		case a.Applet, a.Marquee, a.Object:
			if p.popUntil(defaultScope, p.tok.DataAtom) {
				p.clearActiveFormattingElements()
			}
		case a.Br:
			// </br> is turned into a <br> start tag and processed again.
			p.tok.Type = StartTagToken
			return false
		default:
			p.inBodyEndTagOther(p.tok.DataAtom)
		}
	case CommentToken:
		p.addChild(&Node{
			Type: CommentNode,
			Data: p.tok.Data,
		})
	}

	return true
}
|
|
|
// inBodyEndTagFormatting handles, for inBodyIM, a tag that closes the
// formatting element identified by tagAtom. It repairs mis-nested formatting
// markup by cloning formatting elements and re-parenting nodes, and updates
// both the stack of open elements and the list of active formatting elements.
//
// If no matching formatting element is active after the last scope marker,
// the tag is handled by inBodyEndTagOther instead.
func (p *parser) inBodyEndTagFormatting(tagAtom a.Atom) {
	// This is the "adoption agency" algorithm, described at
	// https://html.spec.whatwg.org/multipage/syntax.html#adoptionAgency

	// TODO: this is a fairly literal line-by-line translation of that algorithm.
	// Once the code successfully parses the comprehensive test suite, we should
	// refactor this code to be more idiomatic.

	// Steps 1-4. The outer loop. It runs at most eight times.
	for i := 0; i < 8; i++ {
		// Step 5. Find the formatting element: the newest active formatting
		// entry with the given tag that comes after the last scope marker.
		var formattingElement *Node
		for j := len(p.afe) - 1; j >= 0; j-- {
			if p.afe[j].Type == scopeMarkerNode {
				break
			}
			if p.afe[j].DataAtom == tagAtom {
				formattingElement = p.afe[j]
				break
			}
		}
		if formattingElement == nil {
			p.inBodyEndTagOther(tagAtom)
			return
		}
		// A formatting element that is no longer open is just dropped from
		// the active list.
		feIndex := p.oe.index(formattingElement)
		if feIndex == -1 {
			p.afe.remove(formattingElement)
			return
		}
		if !p.elementInScope(defaultScope, tagAtom) {
			// Ignore the tag.
			return
		}

		// Steps 9-10. Find the furthest block: the first special element at
		// or above the formatting element on the stack.
		var furthestBlock *Node
		for _, e := range p.oe[feIndex:] {
			if isSpecialElement(e) {
				furthestBlock = e
				break
			}
		}
		if furthestBlock == nil {
			// No furthest block: pop up to and including the formatting
			// element and remove it from the active list.
			e := p.oe.pop()
			for e != formattingElement {
				e = p.oe.pop()
			}
			p.afe.remove(e)
			return
		}

		// Steps 11-12. Find the common ancestor and bookmark node.
		commonAncestor := p.oe[feIndex-1]
		bookmark := p.afe.index(formattingElement)

		// Step 13. The inner loop. Find the lastNode to reparent.
		lastNode := furthestBlock
		node := furthestBlock
		x := p.oe.index(node)
		// Steps 13.1-13.2. The inner loop runs at most three times.
		for j := 0; j < 3; j++ {
			// Step 13.3.
			x--
			node = p.oe[x]
			// Step 13.4 - 13.5. Nodes that are not active formatting
			// elements are removed from the stack of open elements.
			if p.afe.index(node) == -1 {
				p.oe.remove(node)
				continue
			}
			// Step 13.6.
			if node == formattingElement {
				break
			}
			// Step 13.7. Replace node by a clone in both lists.
			clone := node.clone()
			p.afe[p.afe.index(node)] = clone
			p.oe[p.oe.index(node)] = clone
			node = clone
			// Step 13.8.
			if lastNode == furthestBlock {
				bookmark = p.afe.index(node) + 1
			}
			// Step 13.9. Move lastNode under node.
			if lastNode.Parent != nil {
				lastNode.Parent.RemoveChild(lastNode)
			}
			node.AppendChild(lastNode)
			// Step 13.10.
			lastNode = node
		}

		// Step 14. Reparent lastNode to the common ancestor,
		// or for misnested table nodes, to the foster parent.
		if lastNode.Parent != nil {
			lastNode.Parent.RemoveChild(lastNode)
		}
		switch commonAncestor.DataAtom {
		case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr:
			p.fosterParent(lastNode)
		default:
			commonAncestor.AppendChild(lastNode)
		}

		// Steps 15-17. Reparent nodes from the furthest block's children
		// to a clone of the formatting element.
		clone := formattingElement.clone()
		reparentChildren(clone, furthestBlock)
		furthestBlock.AppendChild(clone)

		// Step 18. Fix up the list of active formatting elements.
		if oldLoc := p.afe.index(formattingElement); oldLoc != -1 && oldLoc < bookmark {
			// Move the bookmark with the rest of the list.
			bookmark--
		}
		p.afe.remove(formattingElement)
		p.afe.insert(bookmark, clone)

		// Step 19. Fix up the stack of open elements.
		p.oe.remove(formattingElement)
		p.oe.insert(p.oe.index(furthestBlock)+1, clone)
	}
}
|
|
|
// inBodyEndTagOther performs the "any other end tag" algorithm for inBodyIM. |
|
// "Any other end tag" handling from 12.2.5.5 The rules for parsing tokens in foreign content |
|
// https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inforeign |
|
func (p *parser) inBodyEndTagOther(tagAtom a.Atom) { |
|
for i := len(p.oe) - 1; i >= 0; i-- { |
|
if p.oe[i].DataAtom == tagAtom { |
|
p.oe = p.oe[:i] |
|
break |
|
} |
|
if isSpecialElement(p.oe[i]) { |
|
break |
|
} |
|
} |
|
} |
|
|
|
// Section 12.2.5.4.8. |
|
func textIM(p *parser) bool { |
|
switch p.tok.Type { |
|
case ErrorToken: |
|
p.oe.pop() |
|
case TextToken: |
|
d := p.tok.Data |
|
if n := p.oe.top(); n.DataAtom == a.Textarea && n.FirstChild == nil { |
|
// Ignore a newline at the start of a <textarea> block. |
|
if d != "" && d[0] == '\r' { |
|
d = d[1:] |
|
} |
|
if d != "" && d[0] == '\n' { |
|
d = d[1:] |
|
} |
|
} |
|
if d == "" { |
|
return true |
|
} |
|
p.addText(d) |
|
return true |
|
case EndTagToken: |
|
p.oe.pop() |
|
} |
|
p.im = p.originalIM |
|
p.originalIM = nil |
|
return p.tok.Type == EndTagToken |
|
} |
|
|
|
// Section 12.2.5.4.9. |
|
func inTableIM(p *parser) bool { |
|
switch p.tok.Type { |
|
case ErrorToken: |
|
// Stop parsing. |
|
return true |
|
case TextToken: |
|
p.tok.Data = strings.Replace(p.tok.Data, "\x00", "", -1) |
|
switch p.oe.top().DataAtom { |
|
case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: |
|
if strings.Trim(p.tok.Data, whitespace) == "" { |
|
p.addText(p.tok.Data) |
|
return true |
|
} |
|
} |
|
case StartTagToken: |
|
switch p.tok.DataAtom { |
|
case a.Caption: |
|
p.clearStackToContext(tableScope) |
|
p.afe = append(p.afe, &scopeMarker) |
|
p.addElement() |
|
p.im = inCaptionIM |
|
return true |
|
case a.Colgroup: |
|
p.clearStackToContext(tableScope) |
|
p.addElement() |
|
p.im = inColumnGroupIM |
|
return true |
|
case a.Col: |
|
p.parseImpliedToken(StartTagToken, a.Colgroup, a.Colgroup.String()) |
|
return false |
|
case a.Tbody, a.Tfoot, a.Thead: |
|
p.clearStackToContext(tableScope) |
|
p.addElement() |
|
p.im = inTableBodyIM |
|
return true |
|
case a.Td, a.Th, a.Tr: |
|
p.parseImpliedToken(StartTagToken, a.Tbody, a.Tbody.String()) |
|
return false |
|
case a.Table: |
|
if p.popUntil(tableScope, a.Table) { |
|
p.resetInsertionMode() |
|
return false |
|
} |
|
// Ignore the token. |
|
return true |
|
case a.Style, a.Script: |
|
return inHeadIM(p) |
|
case a.Input: |
|
for _, t := range p.tok.Attr { |
|
if t.Key == "type" && strings.ToLower(t.Val) == "hidden" { |
|
p.addElement() |
|
p.oe.pop() |
|
return true |
|
} |
|
} |
|
// Otherwise drop down to the default action. |
|
case a.Form: |
|
if p.form != nil { |
|
// Ignore the token. |
|
return true |
|
} |
|
p.addElement() |
|
p.form = p.oe.pop() |
|
case a.Select: |
|
p.reconstructActiveFormattingElements() |
|
switch p.top().DataAtom { |
|
case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: |
|
p.fosterParenting = true |
|
} |
|
p.addElement() |
|
p.fosterParenting = false |
|
p.framesetOK = false |
|
p.im = inSelectInTableIM |
|
return true |
|
} |
|
case EndTagToken: |
|
switch p.tok.DataAtom { |
|
case a.Table: |
|
if p.popUntil(tableScope, a.Table) { |
|
p.resetInsertionMode() |
|
return true |
|
} |
|
// Ignore the token. |
|
return true |
|
case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: |
|
// Ignore the token. |
|
return true |
|
} |
|
case CommentToken: |
|
p.addChild(&Node{ |
|
Type: CommentNode, |
|
Data: p.tok.Data, |
|
}) |
|
return true |
|
case DoctypeToken: |
|
// Ignore the token. |
|
return true |
|
} |
|
|
|
p.fosterParenting = true |
|
defer func() { p.fosterParenting = false }() |
|
|
|
return inBodyIM(p) |
|
} |
|
|
|
// Section 12.2.5.4.11. |
|
func inCaptionIM(p *parser) bool { |
|
switch p.tok.Type { |
|
case StartTagToken: |
|
switch p.tok.DataAtom { |
|
case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Thead, a.Tr: |
|
if p.popUntil(tableScope, a.Caption) { |
|
p.clearActiveFormattingElements() |
|
p.im = inTableIM |
|
return false |
|
} else { |
|
// Ignore the token. |
|
return true |
|
} |
|
case a.Select: |
|
p.reconstructActiveFormattingElements() |
|
p.addElement() |
|
p.framesetOK = false |
|
p.im = inSelectInTableIM |
|
return true |
|
} |
|
case EndTagToken: |
|
switch p.tok.DataAtom { |
|
case a.Caption: |
|
if p.popUntil(tableScope, a.Caption) { |
|
p.clearActiveFormattingElements() |
|
p.im = inTableIM |
|
} |
|
return true |
|
case a.Table: |
|
if p.popUntil(tableScope, a.Caption) { |
|
p.clearActiveFormattingElements() |
|
p.im = inTableIM |
|
return false |
|
} else { |
|
// Ignore the token. |
|
return true |
|
} |
|
case a.Body, a.Col, a.Colgroup, a.Html, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: |
|
// Ignore the token. |
|
return true |
|
} |
|
} |
|
return inBodyIM(p) |
|
} |
|
|
|
// Section 12.2.5.4.12. |
|
func inColumnGroupIM(p *parser) bool { |
|
switch p.tok.Type { |
|
case TextToken: |
|
s := strings.TrimLeft(p.tok.Data, whitespace) |
|
if len(s) < len(p.tok.Data) { |
|
// Add the initial whitespace to the current node. |
|
p.addText(p.tok.Data[:len(p.tok.Data)-len(s)]) |
|
if s == "" { |
|
return true |
|
} |
|
p.tok.Data = s |
|
} |
|
case CommentToken: |
|
p.addChild(&Node{ |
|
Type: CommentNode, |
|
Data: p.tok.Data, |
|
}) |
|
return true |
|
case DoctypeToken: |
|
// Ignore the token. |
|
return true |
|
case StartTagToken: |
|
switch p.tok.DataAtom { |
|
case a.Html: |
|
return inBodyIM(p) |
|
case a.Col: |
|
p.addElement() |
|
p.oe.pop() |
|
p.acknowledgeSelfClosingTag() |
|
return true |
|
} |
|
case EndTagToken: |
|
switch p.tok.DataAtom { |
|
case a.Colgroup: |
|
if p.oe.top().DataAtom != a.Html { |
|
p.oe.pop() |
|
p.im = inTableIM |
|
} |
|
return true |
|
case a.Col: |
|
// Ignore the token. |
|
return true |
|
} |
|
} |
|
if p.oe.top().DataAtom != a.Html { |
|
p.oe.pop() |
|
p.im = inTableIM |
|
return false |
|
} |
|
return true |
|
} |
|
|
|
// Section 12.2.5.4.13. |
|
func inTableBodyIM(p *parser) bool { |
|
switch p.tok.Type { |
|
case StartTagToken: |
|
switch p.tok.DataAtom { |
|
case a.Tr: |
|
p.clearStackToContext(tableBodyScope) |
|
p.addElement() |
|
p.im = inRowIM |
|
return true |
|
case a.Td, a.Th: |
|
p.parseImpliedToken(StartTagToken, a.Tr, a.Tr.String()) |
|
return false |
|
case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead: |
|
if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) { |
|
p.im = inTableIM |
|
return false |
|
} |
|
// Ignore the token. |
|
return true |
|
} |
|
case EndTagToken: |
|
switch p.tok.DataAtom { |
|
case a.Tbody, a.Tfoot, a.Thead: |
|
if p.elementInScope(tableScope, p.tok.DataAtom) { |
|
p.clearStackToContext(tableBodyScope) |
|
p.oe.pop() |
|
p.im = inTableIM |
|
} |
|
return true |
|
case a.Table: |
|
if p.popUntil(tableScope, a.Tbody, a.Thead, a.Tfoot) { |
|
p.im = inTableIM |
|
return false |
|
} |
|
// Ignore the token. |
|
return true |
|
case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th, a.Tr: |
|
// Ignore the token. |
|
return true |
|
} |
|
case CommentToken: |
|
p.addChild(&Node{ |
|
Type: CommentNode, |
|
Data: p.tok.Data, |
|
}) |
|
return true |
|
} |
|
|
|
return inTableIM(p) |
|
} |
|
|
|
// Section 12.2.5.4.14. |
|
func inRowIM(p *parser) bool { |
|
switch p.tok.Type { |
|
case StartTagToken: |
|
switch p.tok.DataAtom { |
|
case a.Td, a.Th: |
|
p.clearStackToContext(tableRowScope) |
|
p.addElement() |
|
p.afe = append(p.afe, &scopeMarker) |
|
p.im = inCellIM |
|
return true |
|
case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Tfoot, a.Thead, a.Tr: |
|
if p.popUntil(tableScope, a.Tr) { |
|
p.im = inTableBodyIM |
|
return false |
|
} |
|
// Ignore the token. |
|
return true |
|
} |
|
case EndTagToken: |
|
switch p.tok.DataAtom { |
|
case a.Tr: |
|
if p.popUntil(tableScope, a.Tr) { |
|
p.im = inTableBodyIM |
|
return true |
|
} |
|
// Ignore the token. |
|
return true |
|
case a.Table: |
|
if p.popUntil(tableScope, a.Tr) { |
|
p.im = inTableBodyIM |
|
return false |
|
} |
|
// Ignore the token. |
|
return true |
|
case a.Tbody, a.Tfoot, a.Thead: |
|
if p.elementInScope(tableScope, p.tok.DataAtom) { |
|
p.parseImpliedToken(EndTagToken, a.Tr, a.Tr.String()) |
|
return false |
|
} |
|
// Ignore the token. |
|
return true |
|
case a.Body, a.Caption, a.Col, a.Colgroup, a.Html, a.Td, a.Th: |
|
// Ignore the token. |
|
return true |
|
} |
|
} |
|
|
|
return inTableIM(p) |
|
} |
|
|
|
// Section 12.2.5.4.15. |
|
func inCellIM(p *parser) bool { |
|
switch p.tok.Type { |
|
case StartTagToken: |
|
switch p.tok.DataAtom { |
|
case a.Caption, a.Col, a.Colgroup, a.Tbody, a.Td, a.Tfoot, a.Th, a.Thead, a.Tr: |
|
if p.popUntil(tableScope, a.Td, a.Th) { |
|
// Close the cell and reprocess. |
|
p.clearActiveFormattingElements() |
|
p.im = inRowIM |
|
return false |
|
} |
|
// Ignore the token. |
|
return true |
|
case a.Select: |
|
p.reconstructActiveFormattingElements() |
|
p.addElement() |
|
p.framesetOK = false |
|
p.im = inSelectInTableIM |
|
return true |
|
} |
|
case EndTagToken: |
|
switch p.tok.DataAtom { |
|
case a.Td, a.Th: |
|
if !p.popUntil(tableScope, p.tok.DataAtom) { |
|
// Ignore the token. |
|
return true |
|
} |
|
p.clearActiveFormattingElements() |
|
p.im = inRowIM |
|
return true |
|
case a.Body, a.Caption, a.Col, a.Colgroup, a.Html: |
|
// Ignore the token. |
|
return true |
|
case a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr: |
|
if !p.elementInScope(tableScope, p.tok.DataAtom) { |
|
// Ignore the token. |
|
return true |
|
} |
|
// Close the cell and reprocess. |
|
p.popUntil(tableScope, a.Td, a.Th) |
|
p.clearActiveFormattingElements() |
|
p.im = inRowIM |
|
return false |
|
} |
|
} |
|
return inBodyIM(p) |
|
} |
|
|
|
// Section 12.2.5.4.16. |
|
func inSelectIM(p *parser) bool { |
|
switch p.tok.Type { |
|
case ErrorToken: |
|
// Stop parsing. |
|
return true |
|
case TextToken: |
|
p.addText(strings.Replace(p.tok.Data, "\x00", "", -1)) |
|
case StartTagToken: |
|
switch p.tok.DataAtom { |
|
case a.Html: |
|
return inBodyIM(p) |
|
case a.Option: |
|
if p.top().DataAtom == a.Option { |
|
p.oe.pop() |
|
} |
|
p.addElement() |
|
case a.Optgroup: |
|
if p.top().DataAtom == a.Option { |
|
p.oe.pop() |
|
} |
|
if p.top().DataAtom == a.Optgroup { |
|
p.oe.pop() |
|
} |
|
p.addElement() |
|
case a.Select: |
|
p.tok.Type = EndTagToken |
|
return false |
|
case a.Input, a.Keygen, a.Textarea: |
|
if p.elementInScope(selectScope, a.Select) { |
|
p.parseImpliedToken(EndTagToken, a.Select, a.Select.String()) |
|
return false |
|
} |
|
// In order to properly ignore <textarea>, we need to change the tokenizer mode. |
|
p.tokenizer.NextIsNotRawText() |
|
// Ignore the token. |
|
return true |
|
case a.Script: |
|
return inHeadIM(p) |
|
} |
|
case EndTagToken: |
|
switch p.tok.DataAtom { |
|
case a.Option: |
|
if p.top().DataAtom == a.Option { |
|
p.oe.pop() |
|
} |
|
case a.Optgroup: |
|
i := len(p.oe) - 1 |
|
if p.oe[i].DataAtom == a.Option { |
|
i-- |
|
} |
|
if p.oe[i].DataAtom == a.Optgroup { |
|
p.oe = p.oe[:i] |
|
} |
|
case a.Select: |
|
if p.popUntil(selectScope, a.Select) { |
|
p.resetInsertionMode() |
|
} |
|
} |
|
case CommentToken: |
|
p.addChild(&Node{ |
|
Type: CommentNode, |
|
Data: p.tok.Data, |
|
}) |
|
case DoctypeToken: |
|
// Ignore the token. |
|
return true |
|
} |
|
|
|
return true |
|
} |
|
|
|
// Section 12.2.5.4.17. |
|
func inSelectInTableIM(p *parser) bool { |
|
switch p.tok.Type { |
|
case StartTagToken, EndTagToken: |
|
switch p.tok.DataAtom { |
|
case a.Caption, a.Table, a.Tbody, a.Tfoot, a.Thead, a.Tr, a.Td, a.Th: |
|
if p.tok.Type == StartTagToken || p.elementInScope(tableScope, p.tok.DataAtom) { |
|
p.parseImpliedToken(EndTagToken, a.Select, a.Select.String()) |
|
return false |
|
} else { |
|
// Ignore the token. |
|
return true |
|
} |
|
} |
|
} |
|
return inSelectIM(p) |
|
} |
|
|
|
// Section 12.2.5.4.18. |
|
func afterBodyIM(p *parser) bool { |
|
switch p.tok.Type { |
|
case ErrorToken: |
|
// Stop parsing. |
|
return true |
|
case TextToken: |
|
s := strings.TrimLeft(p.tok.Data, whitespace) |
|
if len(s) == 0 { |
|
// It was all whitespace. |
|
return inBodyIM(p) |
|
} |
|
case StartTagToken: |
|
if p.tok.DataAtom == a.Html { |
|
return inBodyIM(p) |
|
} |
|
case EndTagToken: |
|
if p.tok.DataAtom == a.Html { |
|
if !p.fragment { |
|
p.im = afterAfterBodyIM |
|
} |
|
return true |
|
} |
|
case CommentToken: |
|
// The comment is attached to the <html> element. |
|
if len(p.oe) < 1 || p.oe[0].DataAtom != a.Html { |
|
panic("html: bad parser state: <html> element not found, in the after-body insertion mode") |
|
} |
|
p.oe[0].AppendChild(&Node{ |
|
Type: CommentNode, |
|
Data: p.tok.Data, |
|
}) |
|
return true |
|
} |
|
p.im = inBodyIM |
|
return false |
|
} |
|
|
|
// Section 12.2.5.4.19. |
|
func inFramesetIM(p *parser) bool { |
|
switch p.tok.Type { |
|
case CommentToken: |
|
p.addChild(&Node{ |
|
Type: CommentNode, |
|
Data: p.tok.Data, |
|
}) |
|
case TextToken: |
|
// Ignore all text but whitespace. |
|
s := strings.Map(func(c rune) rune { |
|
switch c { |
|
case ' ', '\t', '\n', '\f', '\r': |
|
return c |
|
} |
|
return -1 |
|
}, p.tok.Data) |
|
if s != "" { |
|
p.addText(s) |
|
} |
|
case StartTagToken: |
|
switch p.tok.DataAtom { |
|
case a.Html: |
|
return inBodyIM(p) |
|
case a.Frameset: |
|
p.addElement() |
|
case a.Frame: |
|
p.addElement() |
|
p.oe.pop() |
|
p.acknowledgeSelfClosingTag() |
|
case a.Noframes: |
|
return inHeadIM(p) |
|
} |
|
case EndTagToken: |
|
switch p.tok.DataAtom { |
|
case a.Frameset: |
|
if p.oe.top().DataAtom != a.Html { |
|
p.oe.pop() |
|
if p.oe.top().DataAtom != a.Frameset { |
|
p.im = afterFramesetIM |
|
return true |
|
} |
|
} |
|
} |
|
default: |
|
// Ignore the token. |
|
} |
|
return true |
|
} |
|
|
|
// Section 12.2.5.4.20. |
|
func afterFramesetIM(p *parser) bool { |
|
switch p.tok.Type { |
|
case CommentToken: |
|
p.addChild(&Node{ |
|
Type: CommentNode, |
|
Data: p.tok.Data, |
|
}) |
|
case TextToken: |
|
// Ignore all text but whitespace. |
|
s := strings.Map(func(c rune) rune { |
|
switch c { |
|
case ' ', '\t', '\n', '\f', '\r': |
|
return c |
|
} |
|
return -1 |
|
}, p.tok.Data) |
|
if s != "" { |
|
p.addText(s) |
|
} |
|
case StartTagToken: |
|
switch p.tok.DataAtom { |
|
case a.Html: |
|
return inBodyIM(p) |
|
case a.Noframes: |
|
return inHeadIM(p) |
|
} |
|
case EndTagToken: |
|
switch p.tok.DataAtom { |
|
case a.Html: |
|
p.im = afterAfterFramesetIM |
|
return true |
|
} |
|
default: |
|
// Ignore the token. |
|
} |
|
return true |
|
} |
|
|
|
// Section 12.2.5.4.21. |
|
func afterAfterBodyIM(p *parser) bool { |
|
switch p.tok.Type { |
|
case ErrorToken: |
|
// Stop parsing. |
|
return true |
|
case TextToken: |
|
s := strings.TrimLeft(p.tok.Data, whitespace) |
|
if len(s) == 0 { |
|
// It was all whitespace. |
|
return inBodyIM(p) |
|
} |
|
case StartTagToken: |
|
if p.tok.DataAtom == a.Html { |
|
return inBodyIM(p) |
|
} |
|
case CommentToken: |
|
p.doc.AppendChild(&Node{ |
|
Type: CommentNode, |
|
Data: p.tok.Data, |
|
}) |
|
return true |
|
case DoctypeToken: |
|
return inBodyIM(p) |
|
} |
|
p.im = inBodyIM |
|
return false |
|
} |
|
|
|
// Section 12.2.5.4.22. |
|
func afterAfterFramesetIM(p *parser) bool { |
|
switch p.tok.Type { |
|
case CommentToken: |
|
p.doc.AppendChild(&Node{ |
|
Type: CommentNode, |
|
Data: p.tok.Data, |
|
}) |
|
case TextToken: |
|
// Ignore all text but whitespace. |
|
s := strings.Map(func(c rune) rune { |
|
switch c { |
|
case ' ', '\t', '\n', '\f', '\r': |
|
return c |
|
} |
|
return -1 |
|
}, p.tok.Data) |
|
if s != "" { |
|
p.tok.Data = s |
|
return inBodyIM(p) |
|
} |
|
case StartTagToken: |
|
switch p.tok.DataAtom { |
|
case a.Html: |
|
return inBodyIM(p) |
|
case a.Noframes: |
|
return inHeadIM(p) |
|
} |
|
case DoctypeToken: |
|
return inBodyIM(p) |
|
default: |
|
// Ignore the token. |
|
} |
|
return true |
|
} |
|
|
|
const whitespaceOrNUL = whitespace + "\x00" |
|
|
|
// Section 12.2.5.5. |
|
func parseForeignContent(p *parser) bool { |
|
switch p.tok.Type { |
|
case TextToken: |
|
if p.framesetOK { |
|
p.framesetOK = strings.TrimLeft(p.tok.Data, whitespaceOrNUL) == "" |
|
} |
|
p.tok.Data = strings.Replace(p.tok.Data, "\x00", "\ufffd", -1) |
|
p.addText(p.tok.Data) |
|
case CommentToken: |
|
p.addChild(&Node{ |
|
Type: CommentNode, |
|
Data: p.tok.Data, |
|
}) |
|
case StartTagToken: |
|
b := breakout[p.tok.Data] |
|
if p.tok.DataAtom == a.Font { |
|
loop: |
|
for _, attr := range p.tok.Attr { |
|
switch attr.Key { |
|
case "color", "face", "size": |
|
b = true |
|
break loop |
|
} |
|
} |
|
} |
|
if b { |
|
for i := len(p.oe) - 1; i >= 0; i-- { |
|
n := p.oe[i] |
|
if n.Namespace == "" || htmlIntegrationPoint(n) || mathMLTextIntegrationPoint(n) { |
|
p.oe = p.oe[:i+1] |
|
break |
|
} |
|
} |
|
return false |
|
} |
|
switch p.top().Namespace { |
|
case "math": |
|
adjustAttributeNames(p.tok.Attr, mathMLAttributeAdjustments) |
|
case "svg": |
|
// Adjust SVG tag names. The tokenizer lower-cases tag names, but |
|
// SVG wants e.g. "foreignObject" with a capital second "O". |
|
if x := svgTagNameAdjustments[p.tok.Data]; x != "" { |
|
p.tok.DataAtom = a.Lookup([]byte(x)) |
|
p.tok.Data = x |
|
} |
|
adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments) |
|
default: |
|
panic("html: bad parser state: unexpected namespace") |
|
} |
|
adjustForeignAttributes(p.tok.Attr) |
|
namespace := p.top().Namespace |
|
p.addElement() |
|
p.top().Namespace = namespace |
|
if namespace != "" { |
|
// Don't let the tokenizer go into raw text mode in foreign content |
|
// (e.g. in an SVG <title> tag). |
|
p.tokenizer.NextIsNotRawText() |
|
} |
|
if p.hasSelfClosingToken { |
|
p.oe.pop() |
|
p.acknowledgeSelfClosingTag() |
|
} |
|
case EndTagToken: |
|
for i := len(p.oe) - 1; i >= 0; i-- { |
|
if p.oe[i].Namespace == "" { |
|
return p.im(p) |
|
} |
|
if strings.EqualFold(p.oe[i].Data, p.tok.Data) { |
|
p.oe = p.oe[:i] |
|
break |
|
} |
|
} |
|
return true |
|
default: |
|
// Ignore the token. |
|
} |
|
return true |
|
} |
|
|
|
// Section 12.2.5. |
|
func (p *parser) inForeignContent() bool { |
|
if len(p.oe) == 0 { |
|
return false |
|
} |
|
n := p.oe[len(p.oe)-1] |
|
if n.Namespace == "" { |
|
return false |
|
} |
|
if mathMLTextIntegrationPoint(n) { |
|
if p.tok.Type == StartTagToken && p.tok.DataAtom != a.Mglyph && p.tok.DataAtom != a.Malignmark { |
|
return false |
|
} |
|
if p.tok.Type == TextToken { |
|
return false |
|
} |
|
} |
|
if n.Namespace == "math" && n.DataAtom == a.AnnotationXml && p.tok.Type == StartTagToken && p.tok.DataAtom == a.Svg { |
|
return false |
|
} |
|
if htmlIntegrationPoint(n) && (p.tok.Type == StartTagToken || p.tok.Type == TextToken) { |
|
return false |
|
} |
|
if p.tok.Type == ErrorToken { |
|
return false |
|
} |
|
return true |
|
} |
|
|
|
// parseImpliedToken parses a token as though it had appeared in the parser's |
|
// input. |
|
func (p *parser) parseImpliedToken(t TokenType, dataAtom a.Atom, data string) { |
|
realToken, selfClosing := p.tok, p.hasSelfClosingToken |
|
p.tok = Token{ |
|
Type: t, |
|
DataAtom: dataAtom, |
|
Data: data, |
|
} |
|
p.hasSelfClosingToken = false |
|
p.parseCurrentToken() |
|
p.tok, p.hasSelfClosingToken = realToken, selfClosing |
|
} |
|
|
|
// parseCurrentToken runs the current token through the parsing routines |
|
// until it is consumed. |
|
func (p *parser) parseCurrentToken() { |
|
if p.tok.Type == SelfClosingTagToken { |
|
p.hasSelfClosingToken = true |
|
p.tok.Type = StartTagToken |
|
} |
|
|
|
consumed := false |
|
for !consumed { |
|
if p.inForeignContent() { |
|
consumed = parseForeignContent(p) |
|
} else { |
|
consumed = p.im(p) |
|
} |
|
} |
|
|
|
if p.hasSelfClosingToken { |
|
// This is a parse error, but ignore it. |
|
p.hasSelfClosingToken = false |
|
} |
|
} |
|
|
|
func (p *parser) parse() error { |
|
// Iterate until EOF. Any other error will cause an early return. |
|
var err error |
|
for err != io.EOF { |
|
// CDATA sections are allowed only in foreign content. |
|
n := p.oe.top() |
|
p.tokenizer.AllowCDATA(n != nil && n.Namespace != "") |
|
// Read and parse the next token. |
|
p.tokenizer.Next() |
|
p.tok = p.tokenizer.Token() |
|
if p.tok.Type == ErrorToken { |
|
err = p.tokenizer.Err() |
|
if err != nil && err != io.EOF { |
|
return err |
|
} |
|
} |
|
p.parseCurrentToken() |
|
} |
|
return nil |
|
} |
|
|
|
// Parse returns the parse tree for the HTML from the given Reader. |
|
// The input is assumed to be UTF-8 encoded. |
|
func Parse(r io.Reader) (*Node, error) { |
|
p := &parser{ |
|
tokenizer: NewTokenizer(r), |
|
doc: &Node{ |
|
Type: DocumentNode, |
|
}, |
|
scripting: true, |
|
framesetOK: true, |
|
im: initialIM, |
|
} |
|
err := p.parse() |
|
if err != nil { |
|
return nil, err |
|
} |
|
return p.doc, nil |
|
} |
|
|
|
// ParseFragment parses a fragment of HTML and returns the nodes that were |
|
// found. If the fragment is the InnerHTML for an existing element, pass that |
|
// element in context. |
|
func ParseFragment(r io.Reader, context *Node) ([]*Node, error) { |
|
contextTag := "" |
|
if context != nil { |
|
if context.Type != ElementNode { |
|
return nil, errors.New("html: ParseFragment of non-element Node") |
|
} |
|
// The next check isn't just context.DataAtom.String() == context.Data because |
|
// it is valid to pass an element whose tag isn't a known atom. For example, |
|
// DataAtom == 0 and Data = "tagfromthefuture" is perfectly consistent. |
|
if context.DataAtom != a.Lookup([]byte(context.Data)) { |
|
return nil, fmt.Errorf("html: inconsistent Node: DataAtom=%q, Data=%q", context.DataAtom, context.Data) |
|
} |
|
contextTag = context.DataAtom.String() |
|
} |
|
p := &parser{ |
|
tokenizer: NewTokenizerFragment(r, contextTag), |
|
doc: &Node{ |
|
Type: DocumentNode, |
|
}, |
|
scripting: true, |
|
fragment: true, |
|
context: context, |
|
} |
|
|
|
root := &Node{ |
|
Type: ElementNode, |
|
DataAtom: a.Html, |
|
Data: a.Html.String(), |
|
} |
|
p.doc.AppendChild(root) |
|
p.oe = nodeStack{root} |
|
p.resetInsertionMode() |
|
|
|
for n := context; n != nil; n = n.Parent { |
|
if n.Type == ElementNode && n.DataAtom == a.Form { |
|
p.form = n |
|
break |
|
} |
|
} |
|
|
|
err := p.parse() |
|
if err != nil { |
|
return nil, err |
|
} |
|
|
|
parent := p.doc |
|
if context != nil { |
|
parent = root |
|
} |
|
|
|
var result []*Node |
|
for c := parent.FirstChild; c != nil; { |
|
next := c.NextSibling |
|
parent.RemoveChild(c) |
|
result = append(result, c) |
|
c = next |
|
} |
|
return result, nil |
|
}
|
|
|