Skip to content

Commit

Permalink
html: add Node.{Ancestors,ChildNodes,Descendants}()
Browse files Browse the repository at this point in the history
Adds iterators for the parents, immediate children, and all children of a Node respectively.

Fixes golang/go#62113

Change-Id: Iab015872cc3a20fe5e7cae3bc90b89cba68cc3f8
GitHub-Last-Rev: d99de58
GitHub-Pull-Request: #215
Reviewed-on: https://go-review.googlesource.com/c/net/+/594195
Reviewed-by: Ian Lance Taylor <iant@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Auto-Submit: Ian Lance Taylor <iant@google.com>
Reviewed-by: Damien Neil <dneil@google.com>
  • Loading branch information
earthboundkid authored and gopherbot committed Oct 29, 2024
1 parent 4783315 commit 511cc3a
Show file tree
Hide file tree
Showing 5 changed files with 163 additions and 13 deletions.
7 changes: 1 addition & 6 deletions html/doc.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,16 +78,11 @@ example, to process each anchor node in depth-first order:
if err != nil {
// ...
}
var f func(*html.Node)
f = func(n *html.Node) {
for n := range doc.Descendants() {
if n.Type == html.ElementNode && n.Data == "a" {
// Do something with n...
}
for c := n.FirstChild; c != nil; c = c.NextSibling {
f(c)
}
}
f(doc)
The relevant specifications include:
https://html.spec.whatwg.org/multipage/syntax.html and
Expand Down
13 changes: 6 additions & 7 deletions html/example_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build go1.23

// This example demonstrates parsing HTML data and walking the resulting tree.
package html_test

Expand All @@ -11,6 +13,7 @@ import (
"strings"

"golang.org/x/net/html"
"golang.org/x/net/html/atom"
)

func ExampleParse() {
Expand All @@ -19,21 +22,17 @@ func ExampleParse() {
if err != nil {
log.Fatal(err)
}
var f func(*html.Node)
f = func(n *html.Node) {
if n.Type == html.ElementNode && n.Data == "a" {
for n := range doc.Descendants() {
if n.Type == html.ElementNode && n.DataAtom == atom.A {
for _, a := range n.Attr {
if a.Key == "href" {
fmt.Println(a.Val)
break
}
}
}
for c := n.FirstChild; c != nil; c = c.NextSibling {
f(c)
}
}
f(doc)

// Output:
// foo
// /bar/baz
Expand Down
56 changes: 56 additions & 0 deletions html/iter.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build go1.23

package html

import "iter"

// Ancestors returns an iterator over the ancestors of n, starting with n.Parent.
//
// Mutating a Node or its parents while iterating may have unexpected results.
func (n *Node) Ancestors() iter.Seq[*Node] {
_ = n.Parent // eager nil check

return func(yield func(*Node) bool) {
for p := n.Parent; p != nil && yield(p); p = p.Parent {
}
}
}

// ChildNodes returns an iterator over the immediate children of n,
// starting with n.FirstChild.
//
// Mutating a Node or its children while iterating may have unexpected results.
func (n *Node) ChildNodes() iter.Seq[*Node] {
_ = n.FirstChild // eager nil check

return func(yield func(*Node) bool) {
for c := n.FirstChild; c != nil && yield(c); c = c.NextSibling {
}
}

}

// Descendants returns an iterator over all nodes recursively beneath
// n, excluding n itself. Nodes are visited in depth-first preorder.
//
// Mutating a Node or its descendants while iterating may have unexpected results.
func (n *Node) Descendants() iter.Seq[*Node] {
_ = n.FirstChild // eager nil check

return func(yield func(*Node) bool) {
n.descendants(yield)
}
}

func (n *Node) descendants(yield func(*Node) bool) bool {
for c := range n.ChildNodes() {
if !yield(c) || !c.descendants(yield) {
return false
}
}
return true
}
96 changes: 96 additions & 0 deletions html/iter_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
// Copyright 2024 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build go1.23

package html

import (
"strings"
"testing"
)

func TestNode_ChildNodes(t *testing.T) {
tests := []struct {
in string
want string
}{
{"", ""},
{"<a></a>", "a"},
{"a", "a"},
{"<a></a><!--b-->", "a b"},
{"a<b></b>c", "a b c"},
{"a<b><!--c--></b>d", "a b d"},
{"<a><b>c<!--d-->e</b></a>f<!--g--><h>i</h>", "a f g h"},
}
for _, test := range tests {
doc, err := Parse(strings.NewReader(test.in))
if err != nil {
t.Fatal(err)
}
// Drill to <html><head></head><body>
n := doc.FirstChild.FirstChild.NextSibling
var results []string
for c := range n.ChildNodes() {
results = append(results, c.Data)
}
if got := strings.Join(results, " "); got != test.want {
t.Errorf("ChildNodes = %q, want %q", got, test.want)
}
}
}

func TestNode_Descendants(t *testing.T) {
tests := []struct {
in string
want string
}{
{"", ""},
{"<a></a>", "a"},
{"<a><b></b></a>", "a b"},
{"<a>b</a>", "a b"},
{"<a><!--b--></a>", "a b"},
{"<a>b<c></c>d</a>", "a b c d"},
{"<a>b<c><!--d--></c>e</a>", "a b c d e"},
{"<a><b><c>d<!--e-->f</c></b>g<!--h--><i>j</i></a>", "a b c d e f g h i j"},
}
for _, test := range tests {
doc, err := Parse(strings.NewReader(test.in))
if err != nil {
t.Fatal(err)
}
// Drill to <html><head></head><body>
n := doc.FirstChild.FirstChild.NextSibling
var results []string
for c := range n.Descendants() {
results = append(results, c.Data)
}
if got := strings.Join(results, " "); got != test.want {
t.Errorf("Descendants = %q; want: %q", got, test.want)
}
}
}

func TestNode_Ancestors(t *testing.T) {
for _, size := range []int{0, 1, 2, 10, 100, 10_000} {
n := buildChain(size)
nParents := 0
for _ = range n.Ancestors() {
nParents++
}
if nParents != size {
t.Errorf("number of Ancestors = %d; want: %d", nParents, size)
}
}
}

func buildChain(size int) *Node {
child := new(Node)
for range size {
parent := child
child = new(Node)
parent.AppendChild(child)
}
return child
}
4 changes: 4 additions & 0 deletions html/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ var scopeMarker = Node{Type: scopeMarkerNode}
// that it looks like "a<b" rather than "a&lt;b". For element nodes, DataAtom
// is the atom for Data, or zero if Data is not a known tag name.
//
// Node trees may be navigated using the link fields (Parent,
// FirstChild, and so on) or a range loop over iterators such as
// [Node.Descendants].
//
// An empty Namespace implies a "http://www.w3.org/1999/xhtml" namespace.
// Similarly, "math" is short for "http://www.w3.org/1998/Math/MathML", and
// "svg" is short for "http://www.w3.org/2000/svg".
Expand Down

0 comments on commit 511cc3a

Please sign in to comment.