-
Notifications
You must be signed in to change notification settings - Fork 17
/
clean.go
67 lines (63 loc) · 1.8 KB
/
clean.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
package htmldiff
import (
htm "html"
"strings"
"unicode/utf8"
"golang.org/x/net/html"
"golang.org/x/net/html/atom"
)
// delAttr returns attr with the element at index ai removed.
//
// The removal is done in place: the returned slice shares attr's backing
// array, so the caller must use the result instead of attr. When the removal
// leaves no attributes at all, nil is returned.
//
// An index outside [0, len(attr)) does not identify an attribute, so attr is
// returned unchanged rather than being discarded wholesale (or panicking on a
// negative index).
func delAttr(attr []html.Attribute, ai int) (ret []html.Attribute) {
	if ai < 0 || ai >= len(attr) {
		return attr // nothing to delete
	}
	if len(attr) == 1 {
		return nil // the only attribute is being removed
	}
	return append(attr[:ai], attr[ai+1:]...)
}
// clean normalises n and, recursively, every node that remains below it:
//   - on element nodes, style attributes that are empty (or only whitespace)
//     are deleted, non-empty ones have every space removed and a trailing ";"
//     ensured, and colspan="1" is deleted from <td> elements;
//   - on text nodes, HTML entities are unescaped so that characters are held
//     as plain UTF-8;
//   - child elements whose tag name is listed in c.CleanTags are detached from
//     n, together with their subtrees, and are not counted.
//
// It returns the estimated number of treeRunes that will be used: one for each
// non-text node plus one for each rune of text in the cleaned tree.
// TODO more cleaning of the input HTML, as required.
func (c *Config) clean(n *html.Node) int {
	size := 1
	switch n.Type {
	case html.ElementNode:
		for ai := 0; ai < len(n.Attr); ai++ {
			a := n.Attr[ai]
			key := strings.ToLower(a.Key)
			switch {
			case key == "style":
				if strings.TrimSpace(a.Val) == "" { // delete empty styles
					n.Attr = delAttr(n.Attr, ai)
					ai-- // the next attribute has moved into slot ai
				} else { // tidy non-empty styles
					// TODO there could be more here to make sure the style entries are in the same order etc.
					val := strings.Replace(a.Val, " ", "", -1)
					if !strings.HasSuffix(val, ";") {
						val += ";"
					}
					n.Attr[ai].Val = val
				}
			case n.DataAtom == atom.Td &&
				key == "colspan" &&
				strings.TrimSpace(a.Val) == "1":
				n.Attr = delAttr(n.Attr, ai)
				ai-- // the next attribute has moved into slot ai
			}
		}
	case html.TextNode:
		n.Data = htm.UnescapeString(n.Data)
		size += utf8.RuneCountInString(n.Data) - 1 // len(n.Data) would be faster, but use more memory
	}

	// Remember the next sibling before touching ch: RemoveChild clears the
	// removed node's sibling links, and restarting the scan from FirstChild
	// would clean (and unescape) the earlier siblings again and add their
	// sizes to the total a second time.
	var next *html.Node
	for ch := n.FirstChild; ch != nil; ch = next {
		next = ch.NextSibling
		unwanted := false
		if ch.Type == html.ElementNode {
			for _, rr := range c.CleanTags {
				if rr == ch.Data {
					unwanted = true
					break
				}
			}
		}
		if unwanted {
			n.RemoveChild(ch)
			continue
		}
		size += c.clean(ch)
	}
	return size
}