Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor internal/reader/readability/readability.go #3006

Merged
merged 1 commit into from
Dec 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 23 additions & 29 deletions internal/reader/readability/readability.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
package readability // import "miniflux.app/v2/internal/reader/readability"

import (
"bytes"
"fmt"
"io"
"log/slog"
Expand All @@ -23,7 +22,6 @@ const (

var (
divToPElementsRegexp = regexp.MustCompile(`(?i)<(a|blockquote|dl|div|img|ol|p|pre|table|ul)`)
sentenceRegexp = regexp.MustCompile(`\.( |$)`)

blacklistCandidatesRegexp = regexp.MustCompile(`popupbody|-ad|g-plus`)
okMaybeItsACandidateRegexp = regexp.MustCompile(`and|article|body|column|main|shadow`)
Expand Down Expand Up @@ -84,7 +82,7 @@ func ExtractContent(page io.Reader) (baseURL string, extractedContent string, er
}

document.Find("script,style").Each(func(i int, s *goquery.Selection) {
removeNodes(s)
s.Remove()
})

transformMisusedDivsIntoParagraphs(document)
Expand All @@ -106,7 +104,8 @@ func ExtractContent(page io.Reader) (baseURL string, extractedContent string, er
// Now that we have the top candidate, look through its siblings for content that might also be related.
// Things like preambles, content split by ads that we removed, etc.
func getArticle(topCandidate *candidate, candidates candidateList) string {
output := bytes.NewBufferString("<div>")
var output strings.Builder
output.WriteString("<div>")
siblingScoreThreshold := max(10, topCandidate.score*.2)

topCandidate.selection.Siblings().Union(topCandidate.selection).Each(func(i int, s *goquery.Selection) {
Expand All @@ -124,10 +123,14 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
content := s.Text()
contentLength := len(content)

if contentLength >= 80 && linkDensity < .25 {
append = true
} else if contentLength < 80 && linkDensity == 0 && sentenceRegexp.MatchString(content) {
append = true
if contentLength >= 80 {
if linkDensity < .25 {
append = true
}
} else {
if linkDensity == 0 && containsSentence(content) {
append = true
}
}
}

Expand All @@ -138,7 +141,7 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
}

html, _ := s.Html()
fmt.Fprintf(output, "<%s>%s</%s>", tag, html, tag)
output.WriteString("<" + tag + ">" + html + "</" + tag + ">")
}
})

Expand All @@ -156,9 +159,9 @@ func removeUnlikelyCandidates(document *goquery.Document) {
str := strings.ToLower(class + id)

if blacklistCandidatesRegexp.MatchString(str) {
removeNodes(s)
s.Remove()
} else if unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str) {
removeNodes(s)
s.Remove()
}
})
}
Expand Down Expand Up @@ -222,7 +225,7 @@ func getCandidates(document *goquery.Document) candidateList {
contentScore += float32(strings.Count(text, ",") + 1)

// For every 100 characters in this paragraph, add another point. Up to 3 points.
contentScore += float32(min(int(len(text)/100.0), 3))
contentScore += float32(min(len(text)/100.0, 3))

candidates[parentNode].score += contentScore
if grandParentNode != nil {
Expand Down Expand Up @@ -261,13 +264,14 @@ func scoreNode(s *goquery.Selection) *candidate {
// getLinkDensity returns the share of a node's text that sits inside links:
// the byte length of the text of all descendant <a> elements divided by the
// byte length of the node's whole text. A node with no text yields 0.
func getLinkDensity(s *goquery.Selection) float32 {
	textLength := len(s.Text())

	// Guard against dividing by zero; returning early also skips the link
	// lookup entirely for empty nodes.
	if textLength == 0 {
		return 0
	}

	linkLength := len(s.Find("a").Text())

	return float32(linkLength) / float32(textLength)
}

Expand All @@ -278,25 +282,20 @@ func getClassWeight(s *goquery.Selection) float32 {
class, _ := s.Attr("class")
id, _ := s.Attr("id")

class = strings.ToLower(class)
id = strings.ToLower(id)

if class != "" {
class = strings.ToLower(class)
if negativeRegexp.MatchString(class) {
weight -= 25
}

if positiveRegexp.MatchString(class) {
} else if positiveRegexp.MatchString(class) {
weight += 25
}
}

if id != "" {
id = strings.ToLower(id)
if negativeRegexp.MatchString(id) {
weight -= 25
}

if positiveRegexp.MatchString(id) {
} else if positiveRegexp.MatchString(id) {
weight += 25
}
}
Expand All @@ -314,11 +313,6 @@ func transformMisusedDivsIntoParagraphs(document *goquery.Document) {
})
}

func removeNodes(s *goquery.Selection) {
s.Each(func(i int, s *goquery.Selection) {
parent := s.Parent()
if parent.Length() > 0 {
parent.Get(0).RemoveChild(s.Get(0))
}
})
// containsSentence reports whether content holds a sentence-ending period,
// meaning a "." that is either followed by a space or is the last character
// of the string.
func containsSentence(content string) bool {
	if strings.Contains(content, ". ") {
		return true
	}
	return strings.HasSuffix(content, ".")
}
61 changes: 61 additions & 0 deletions internal/reader/readability/readability_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,3 +100,64 @@ func TestWithoutBaseURL(t *testing.T) {
t.Errorf(`Unexpected base URL, got %q instead of ""`, baseURL)
}
}

// TestRemoveStyleScript checks that <script> and <style> elements are dropped
// from the extracted content, leaving only the article markup.
func TestRemoveStyleScript(t *testing.T) {
	const expected = `<div><div><article>Somecontent</article></div></div>`

	input := `
<html>
<head>
<title>Test</title>
<script src="tololo.js"></script>
</head>
<body>
<script src="tololo.js"></script>
<style>
h1 {color:red;}
p {color:blue;}
</style>
<article>Some content</article>
</body>
</html>`

	_, got, err := ExtractContent(strings.NewReader(input))
	if err != nil {
		t.Fatal(err)
	}

	// Strip newlines, spaces and tabs so the comparison ignores layout.
	for _, ws := range []string{"\n", " ", "\t"} {
		got = strings.ReplaceAll(got, ws, "")
	}

	if got != expected {
		t.Errorf(`Invalid content, got %s instead of %s`, got, expected)
	}
}

// TestRemoveBlacklist checks that elements whose class matches a blacklisted
// pattern ("-ad", "g-plus", "popupbody") are removed and only the remaining
// article is extracted.
func TestRemoveBlacklist(t *testing.T) {
	const expected = `<div><div><articleclass="legit">Valid!</article></div></div>`

	input := `
<html>
<head>
<title>Test</title>
</head>
<body>
<article class="super-ad">Some content</article>
<article class="g-plus-crap">Some other thing</article>
<article class="stuff popupbody">And more</article>
<article class="legit">Valid!</article>
</body>
</html>`

	_, got, err := ExtractContent(strings.NewReader(input))
	if err != nil {
		t.Fatal(err)
	}

	// Strip newlines, spaces and tabs so the comparison ignores layout.
	for _, ws := range []string{"\n", " ", "\t"} {
		got = strings.ReplaceAll(got, ws, "")
	}

	if got != expected {
		t.Errorf(`Invalid content, got %s instead of %s`, got, expected)
	}
}
Loading