-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtoken.go
110 lines (94 loc) · 1.98 KB
/
token.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
package jargon
import (
"unicode"
)
// Token is a piece of text together with the classification flags that were
// recorded for it when it was created.
type Token struct {
	// value is the text of the token.
	value string
	// punct marks the token as punctuation, i.e. as breaking a run of words.
	punct bool
	// space marks the token as consisting entirely of white space.
	space bool
	// lemma marks the token as a canonical term that replaced original token(s).
	lemma bool
}

// String returns the text of the token.
func (t *Token) String() string { return t.value }

// IsPunct reports whether the token should be treated as 'breaking' a run of
// words. This mostly follows Unicode's definition of punctuation, with some
// exceptions for this package's purposes.
func (t *Token) IsPunct() bool { return t.punct }

// IsSpace reports whether the token consists entirely of white space, as
// defined by the unicode package.
//
// A token can be both IsPunct and IsSpace -- for example, line breaks and tabs
// count as punctuation for this package's purposes.
func (t *Token) IsSpace() bool { return t.space }

// IsLemma reports whether the token is a lemma, i.e. a canonical term that
// replaced the original token(s).
func (t *Token) IsLemma() bool { return t.lemma }
// NewToken returns a token for s, working out whether it counts as space
// and/or punctuation. isLemma is recorded on the token as given.
//
// Strings present in the common table yield the shared, pre-built token for
// that (s, isLemma) pair instead of a fresh allocation. For any other input,
// an empty s yields nil.
func NewToken(s string, isLemma bool) *Token {
	// Indexing a missing outer key gives a nil inner map, and reading from a
	// nil map is safe, so this single expression covers "not cached" too.
	if cached, ok := common[s][isLemma]; ok {
		return cached
	}

	if s == "" {
		return nil
	}

	// A CRLF pair is both punctuation and space; no scan is needed.
	if s == "\r\n" {
		return &Token{value: s, punct: true, space: true, lemma: isLemma}
	}

	// Each flag holds only if every rune satisfies its predicate. The
	// short-circuiting && stops calling a predicate once its flag has
	// dropped, and the scan ends as soon as both flags are false.
	allPunct, allSpace := true, true
	for _, r := range s {
		allPunct = allPunct && isPunct(r)
		allSpace = allSpace && unicode.IsSpace(r)
		if !allPunct && !allSpace {
			break
		}
	}

	return &Token{
		value: s,
		punct: allPunct,
		space: allSpace,
		lemma: isLemma,
	}
}
// common holds pre-built tokens for frequently seen strings, keyed first by
// the token text and then by the isLemma flag. It is filled in by init and
// consulted by NewToken.
var common = map[string]map[bool]*Token{}
func init() {
ss := []string{
" ", "\r", "\n", "\t", ".", ",",
"A", "a",
"An", "an",
"The", "the",
"And", "and",
"Or", "or",
"Not", "not",
"Of", "of",
"In", "in",
"On", "on",
"To", "to",
"Be", "be",
"Is", "is",
"Are", "are",
"Has", "has",
"Have", "have",
"It", "it",
"Do", "do",
}
for _, s := range ss {
common[s] = map[bool]*Token{
true: NewToken(s, true),
false: NewToken(s, false),
}
}
}