From 511cc3a40645a2f6ed3d21a1d0803b5057b9aaa1 Mon Sep 17 00:00:00 2001 From: Carlana Johnson Date: Tue, 29 Oct 2024 01:29:34 +0000 Subject: [PATCH] html: add Node.{Ancestors,ChildNodes,Descendants}() Adds iterators for the parents, immediate children, and all children of a Node respectively. Fixes golang/go#62113 Change-Id: Iab015872cc3a20fe5e7cae3bc90b89cba68cc3f8 GitHub-Last-Rev: d99de580ab8c30bb04c9fee401d6bfc5dd55d1ec GitHub-Pull-Request: golang/net#215 Reviewed-on: https://go-review.googlesource.com/c/net/+/594195 Reviewed-by: Ian Lance Taylor LUCI-TryBot-Result: Go LUCI Auto-Submit: Ian Lance Taylor Reviewed-by: Damien Neil --- html/doc.go | 7 +--- html/example_test.go | 13 +++--- html/iter.go | 56 ++++++++++++++++++++++++++ html/iter_test.go | 96 ++++++++++++++++++++++++++++++++++++++++++++ html/node.go | 4 ++ 5 files changed, 163 insertions(+), 13 deletions(-) create mode 100644 html/iter.go create mode 100644 html/iter_test.go diff --git a/html/doc.go b/html/doc.go index 3a7e5ab176..885c4c5936 100644 --- a/html/doc.go +++ b/html/doc.go @@ -78,16 +78,11 @@ example, to process each anchor node in depth-first order: if err != nil { // ... } - var f func(*html.Node) - f = func(n *html.Node) { + for n := range doc.Descendants() { if n.Type == html.ElementNode && n.Data == "a" { // Do something with n... } - for c := n.FirstChild; c != nil; c = c.NextSibling { - f(c) - } } - f(doc) The relevant specifications include: https://html.spec.whatwg.org/multipage/syntax.html and diff --git a/html/example_test.go b/html/example_test.go index 0b06ed7730..830f0b27af 100644 --- a/html/example_test.go +++ b/html/example_test.go @@ -2,6 +2,8 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. +//go:build go1.23 + // This example demonstrates parsing HTML data and walking the resulting tree. package html_test @@ -11,6 +13,7 @@ import ( "strings" "golang.org/x/net/html" + "golang.org/x/net/html/atom" ) func ExampleParse() { @@ -19,9 +22,8 @@ func ExampleParse() { if err != nil { log.Fatal(err) } - var f func(*html.Node) - f = func(n *html.Node) { - if n.Type == html.ElementNode && n.Data == "a" { + for n := range doc.Descendants() { + if n.Type == html.ElementNode && n.DataAtom == atom.A { for _, a := range n.Attr { if a.Key == "href" { fmt.Println(a.Val) @@ -29,11 +31,8 @@ func ExampleParse() { } } } - for c := n.FirstChild; c != nil; c = c.NextSibling { - f(c) - } } - f(doc) + // Output: // foo // /bar/baz diff --git a/html/iter.go b/html/iter.go new file mode 100644 index 0000000000..54be8fd30f --- /dev/null +++ b/html/iter.go @@ -0,0 +1,56 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build go1.23 + +package html + +import "iter" + +// Ancestors returns an iterator over the ancestors of n, starting with n.Parent. +// +// Mutating a Node or its parents while iterating may have unexpected results. +func (n *Node) Ancestors() iter.Seq[*Node] { + _ = n.Parent // eager nil check + + return func(yield func(*Node) bool) { + for p := n.Parent; p != nil && yield(p); p = p.Parent { + } + } +} + +// ChildNodes returns an iterator over the immediate children of n, +// starting with n.FirstChild. +// +// Mutating a Node or its children while iterating may have unexpected results. +func (n *Node) ChildNodes() iter.Seq[*Node] { + _ = n.FirstChild // eager nil check + + return func(yield func(*Node) bool) { + for c := n.FirstChild; c != nil && yield(c); c = c.NextSibling { + } + } + +} + +// Descendants returns an iterator over all nodes recursively beneath +// n, excluding n itself. Nodes are visited in depth-first preorder. +// +// Mutating a Node or its descendants while iterating may have unexpected results. +func (n *Node) Descendants() iter.Seq[*Node] { + _ = n.FirstChild // eager nil check + + return func(yield func(*Node) bool) { + n.descendants(yield) + } +} + +func (n *Node) descendants(yield func(*Node) bool) bool { + for c := range n.ChildNodes() { + if !yield(c) || !c.descendants(yield) { + return false + } + } + return true +} diff --git a/html/iter_test.go b/html/iter_test.go new file mode 100644 index 0000000000..cca7f82f54 --- /dev/null +++ b/html/iter_test.go @@ -0,0 +1,96 @@ +// Copyright 2024 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build go1.23 + +package html + +import ( + "strings" + "testing" +) + +func TestNode_ChildNodes(t *testing.T) { + tests := []struct { + in string + want string + }{ + {"", ""}, + {"", "a"}, + {"a", "a"}, + {"", "a b"}, + {"ac", "a b c"}, + {"ad", "a b d"}, + {"cefi", "a f g h"}, + } + for _, test := range tests { + doc, err := Parse(strings.NewReader(test.in)) + if err != nil { + t.Fatal(err) + } + // Drill to + n := doc.FirstChild.FirstChild.NextSibling + var results []string + for c := range n.ChildNodes() { + results = append(results, c.Data) + } + if got := strings.Join(results, " "); got != test.want { + t.Errorf("ChildNodes = %q, want %q", got, test.want) + } + } +} + +func TestNode_Descendants(t *testing.T) { + tests := []struct { + in string + want string + }{ + {"", ""}, + {"", "a"}, + {"", "a b"}, + {"b", "a b"}, + {"", "a b"}, + {"bd", "a b c d"}, + {"be", "a b c d e"}, + {"dfgj", "a b c d e f g h i j"}, + } + for _, test := range tests { + doc, err := Parse(strings.NewReader(test.in)) + if err != nil { + t.Fatal(err) + } + // Drill to + n := doc.FirstChild.FirstChild.NextSibling + var results []string + for c := range n.Descendants() { + results = append(results, c.Data) + } + if got := strings.Join(results, " "); got != test.want { + t.Errorf("Descendants = %q; want: %q", got, test.want) + } + } +} + +func TestNode_Ancestors(t *testing.T) { + for _, size := range []int{0, 1, 2, 10, 100, 10_000} { + n := buildChain(size) + nParents := 0 + for _ = range n.Ancestors() { + nParents++ + } + if nParents != size { + t.Errorf("number of Ancestors = %d; want: %d", nParents, size) + } + } +} + +func buildChain(size int) *Node { + child := new(Node) + for range size { + parent := child + child = new(Node) + parent.AppendChild(child) + } + return child +} diff --git a/html/node.go b/html/node.go index 1350eef22c..77741a1950 100644 --- a/html/node.go +++ b/html/node.go @@ -38,6 +38,10 @@ var scopeMarker = Node{Type: scopeMarkerNode} // that it looks like "a