-
Notifications
You must be signed in to change notification settings - Fork 9
/
discover_new_words.py
38 lines (30 loc) · 971 Bytes
/
discover_new_words.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
#-*- coding: utf-8 -*-
import sys
import codecs
def discover_new_words(title, content, min_len=3, max_len=6):
    """Collect the substrings of ``title`` that also occur in ``content``.

    Every contiguous slice of ``title`` whose length lies between ``min_len``
    and ``max_len`` (both inclusive) is checked with a plain substring test
    against ``content``. The title, the content and every matching slice are
    echoed to standard output as they are encountered.

    Args:
        title: Text whose slices are the candidate words.
        content: Text in which each candidate is searched for.
        min_len: Shortest candidate length to consider (defaults to 3).
            Values below 1 are treated as 1 so the empty string is never
            reported as a word.
        max_len: Longest candidate length to consider (defaults to 6).

    Returns:
        A set containing every candidate slice found in ``content``; empty
        when ``title`` is shorter than ``min_len`` or nothing matches.
    """
    # A parenthesised single-argument print is valid on both Python 2 and
    # Python 3 and emits the same text as the bare print statement.
    print(title)
    print(content)

    words_set = set()
    t_len = len(title)
    for step in range(max(min_len, 1), max_len + 1):
        # Only start positions that leave room for a full ``step``-long slice;
        # the range is empty when the title is shorter than ``step``.
        for i in range(t_len - step + 1):
            word = title[i:i + step]
            if word in content:
                words_set.add(word)
                print(word)
    return words_set
if __name__ == "__main__":
    # Usage: discover_new_words.py <input_file> <output_file>
    input_file = sys.argv[1]
    output_file = sys.argv[2]

    new_words_set = set()
    with codecs.open(input_file, "r", "utf-8") as rfd:
        # The first line is read and discarded (presumably a header row).
        rfd.readline()
        for line in rfd:
            # Each record is "<idx>\t<title>\t<content>"; limiting the split
            # to two tabs keeps any further tabs inside the content field.
            _idx, title, content = line.strip().split("\t", 2)
            new_words_set.update(discover_new_words(title, content))

    # The output must be opened for writing: a handle opened in "r" mode
    # rejects every write (and the open itself fails if the file does not
    # exist yet), so no results could ever be saved.
    with codecs.open(output_file, "w", "utf-8") as wfd:
        for w in new_words_set:
            wfd.write("%s\n" % w)