-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathner_corpus.py
89 lines (72 loc) · 2.85 KB
/
ner_corpus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#! -*- coding:utf-8 -*-
import json
from tqdm import tqdm
import codecs
import numpy as np
import random
from sklearn.model_selection import train_test_split
import re
# Accumulators built while streaming the training corpus:
#   emotion   - set of emotion labels observed in coreEntityEmotions
#   chars     - character -> frequency over every title + content
#   data      - list of {'content', 'coreEntityEmotions'} training records
emotion = set()
chars = {}
data = []
min_count = 2  # characters rarer than this are dropped from the vocabulary later
with open('./data/coreEntityEmotion_train.txt', encoding='utf-8') as f:
    for l in tqdm(f):
        # Each line is one JSON news record with title, content and labels.
        a = json.loads(l.strip())
        data.append(
            {
                'content': a['title'] + '\n' + a['content'],
                'coreEntityEmotions': [(i['entity'], i['emotion']) for i in a['coreEntityEmotions']],
            }
        )
        # Count character frequencies over content first, then title
        # (same counts and same first-seen order as two separate loops).
        for c in a['content'] + a['title']:
            chars[c] = chars.get(c, 0) + 1
        for c in a['coreEntityEmotions']:
            emotion.add(c['emotion'])
# BUG FIX: enumerating a raw set is nondeterministic across interpreter runs
# (string hash randomization), so the id<->emotion mapping written to
# emotion.json could differ between runs. Sorting first makes it reproducible.
id2emotion = {i: j for i, j in enumerate(sorted(emotion))}
emotion2id = {j: i for i, j in id2emotion.items()}
# Persist the label maps and the full (unsplit) training set, then carve
# out an 80/20 train/dev split with a fixed seed so the split is stable.
with open('./ner_data/emotion.json', 'w', encoding='utf-8') as fp:
    json.dump([id2emotion, emotion2id], fp, indent=4, ensure_ascii=False)
with codecs.open('./ner_data/all_train_data.json', 'w', encoding='utf-8') as fp:
    json.dump(data, fp, indent=4, ensure_ascii=False)
train_data, test_data = train_test_split(data, test_size=0.2, random_state=2019)
# new_train_data = []
# for item in train_data:
# contents = re.split(r'[\n。!?]', item['content'])
# for text in contents:
# if len(text) < 5:
# continue
# new_train_data.append(
# {
# 'content': text,
# 'coreEntityEmotions': item['coreEntityEmotions'],
# }
# )
# Write both halves of the split; dev_data.json is the held-out 20%.
for path, payload in (('./ner_data/train_data.json', train_data),
                      ('./ner_data/dev_data.json', test_data)):
    with codecs.open(path, 'w', encoding='utf-8') as fp:
        json.dump(payload, fp, indent=4, ensure_ascii=False)
# Load the stage-1 test news (no gold labels) and fold its characters into
# the shared frequency dict so the vocabulary also covers the test set.
test_data = []
with open('./data/coreEntityEmotion_test_stage1.txt', encoding='utf-8') as fp:
    for line in tqdm(fp):
        record = json.loads(line.strip())
        test_data.append({
            'newsId': record['newsId'],
            'content': record['title'] + '\n' + record['content'],
        })
        # Same counting scheme as the training pass: content chars, then title.
        for ch in record['content'] + record['title']:
            chars[ch] = chars.get(ch, 0) + 1
with codecs.open('./ner_data/test_data.json', 'w', encoding='utf-8') as fp:
    json.dump(test_data, fp, indent=4, ensure_ascii=False)
# Prune rare characters and assign integer ids; ids 0 and 1 are reserved
# for padding and unknown tokens, so real characters start at id 2.
with codecs.open('./ner_data/all_chars.json', 'w', encoding='utf-8') as fp:
    chars = {ch: n for ch, n in chars.items() if n >= min_count}
    id2char = {idx + 2: ch for idx, ch in enumerate(chars)}  # 0: padding, 1: unk
    char2id = {ch: idx for idx, ch in id2char.items()}
    json.dump([id2char, char2id], fp, indent=4, ensure_ascii=False)