-
Notifications
You must be signed in to change notification settings - Fork 4
/
author.go
89 lines (77 loc) · 1.6 KB
/
author.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
package textractor
import (
"regexp"
"sort"
"strings"
"unicode/utf8"
"github.com/PuerkitoBio/goquery"
)
// patternSuffix is the tail appended to every author keyword to form the full
// matching expression: one separator character taken from the bracketed class,
// optional whitespace, then capture group 1 holding the candidate name, which
// is either 2-20 Han characters or 2-60 word characters.
//
// NOTE(review): inside a character class '|' is a literal, so a pipe is also
// accepted as a separator. ':' is listed twice; the second entry may originally
// have been a different colon character — confirm against upstream.
const patternSuffix = `[:|:| |丨|/]\s*([\p{Han}]{2,20}|[\w]{2,60})`

var (
	// authorPattern holds the keywords that may introduce an author name in
	// body text. init reorders this slice longest-first (by byte length) and
	// compiles each keyword joined with patternSuffix.
	authorPattern = []string{
		"author",
		"责编",
		"责任编辑",
		"作者",
		"记者",
		"编辑",
		"原创",
		"文",
		"撰文",
		"来源",
		"稿件",
		"发稿人",
		"投稿人",
		"投稿",
		"来自",
	}

	// headAuthorPattern holds the substrings looked for in head entry keys.
	headAuthorPattern = []string{"author"}

	// authorPatternRx holds the compiled expressions; it is filled by init.
	authorPatternRx []*regexp.Regexp
)
func init() {
if _, err := regexp.Compile(patternSuffix); err != nil {
panic(err)
}
sort.Slice(authorPattern, func(i, j int) bool {
return len(authorPattern[i]) > len(authorPattern[j])
})
for _, v := range authorPattern {
if rx, err := regexp.Compile(v + patternSuffix); err == nil {
authorPatternRx = append(authorPatternRx, rx)
}
}
}
// authorExtract returns the article author, or "" when none can be found.
//
// Head entries are consulted first: the value of the first entry whose key
// contains one of headAuthorPattern is returned unchanged. Otherwise every
// node returned by iterator(body) whose node name is "#text" is trimmed, the
// ones that are 4 to 25 runes long are kept as candidates, and the first
// candidate accepted by matchAuthor supplies the result.
func authorExtract(headText []*headEntry, body *goquery.Selection) string {
	for _, entry := range headText {
		for _, needle := range headAuthorPattern {
			if strings.Contains(entry.key, needle) {
				return entry.val
			}
		}
	}

	// Rune-count bounds for a text node to be considered as a byline.
	const minRunes, maxRunes = 4, 25

	var candidates []string
	for _, node := range iterator(body) {
		if goquery.NodeName(node) != "#text" {
			continue
		}
		trimmed := strings.TrimSpace(node.Text())
		if n := utf8.RuneCountInString(trimmed); n < minRunes || n > maxRunes {
			continue
		}
		candidates = append(candidates, trimmed)
	}

	for _, candidate := range candidates {
		author, found := matchAuthor(candidate)
		if found {
			return author
		}
	}
	return ""
}
// matchAuthor tries each expression in authorPatternRx, in order, against text.
//
// It returns capture group 1 of the leftmost match of the first expression
// that matches, together with true. When no expression matches it returns
// "" and false.
func matchAuthor(text string) (string, bool) {
	for _, rx := range authorPatternRx {
		// A single submatch search yields both the match decision and the
		// captured name, so the expression runs once per candidate.
		groups := rx.FindStringSubmatch(text)
		if groups == nil {
			continue
		}
		if len(groups) < 2 {
			// The expression matched but declares no capture group.
			return "", true
		}
		return groups[1], true
	}
	return "", false
}