-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgodensity_test.go
91 lines (75 loc) · 2.31 KB
/
godensity_test.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
package godensity
import (
"fmt"
"log"
"net/http"
"sort"
"strings"
"testing"
"github.com/PuerkitoBio/goquery"
)
// TestDensity tests the density analysis of various web pages
func TestDensity(t *testing.T) {
urls := []string{
"https://news.v.daum.net/v/20200216130927758#none",
// Additional URLs can be added as needed
}
for _, url := range urls {
// Fetch the webpage
res, err := http.Get(url)
if err != nil {
log.Fatalf("Failed to get URL %s: %v", url, err)
}
defer res.Body.Close()
// Parse the HTML document
doc, err := goquery.NewDocumentFromResponse(res)
if err != nil {
log.Fatalf("Failed to parse HTML from URL %s: %v", url, err)
}
// Extract and process the body of the page
body := doc.Find("body")
Filtering(body) // Apply filtering logic
diveIntoDOM(body, url) // Analyze DOM for density
// Sort nodes based on densitySum
sort.Slice(array, func(i, j int) bool {
return array[i].densitySum > array[j].densitySum
})
// Define candidates and threshold
var candidates []Node
var threshold float32
log.Println("------------")
log.Printf("Analyzing URL: %s", url)
// Analyze elements and determine the threshold
for _, element := range array {
nodeID, _ := element.goqueryNode.Attr("id")
log.Printf("Node: %s | Density: %.2f | DensitySum: %.2f | ID: %s", goquery.NodeName(element.goqueryNode), element.density, element.densitySum, nodeID)
if strings.Contains(goquery.NodeName(element.goqueryNode), "body") {
threshold = element.density
break
}
candidates = append(candidates, element)
}
// If no candidates, select a default fallback
if len(candidates) == 0 {
candidates = append(candidates, array[1])
}
// Display the top 10 elements based on density
array = array[:10]
for _, c := range array {
id, _ := c.goqueryNode.Attr("id")
class, _ := c.goqueryNode.Attr("class")
fmt.Printf("Node: %s | Density: %.2f | DensitySum: %.2f | ID: %s | Class: %s\n", goquery.NodeName(c.goqueryNode), c.density, c.densitySum, id, class)
}
// Process candidates with density above the threshold
for _, element := range candidates {
cursor := &element
if element.density < threshold {
continue
}
log.Printf("Selected Node: %v", cursor)
break
}
// Clear array for next iteration
array = array[:0]
}
}