data_helper.py
# Borrowed from: https://github.com/guillaumegenthial/sequence_tagging/blob/master/model/data_utils.py
from config import DefaultConfig as cfg

def pad_sequences(sequences, pad_token, type):
    '''
    Pad words or sentences with pad_token so that they all have the same length.
    :param sequences: a list of words or sentences
    :param pad_token: the value appended to sequences that are too short
    :param type: either 'words' or 'sentences'
    :return: a list of words or sentences of equal length, and the original lengths
    '''
    if type == 'sentences':
        max_length = max(map(lambda x: len(x), sequences))
        sequence_padded, sequence_length = add_pad(sequences, pad_token, max_length)
    elif type == 'words':
        # Words are padded to a fixed length from the config rather than to the longest
        # word in the batch: max([max(map(lambda x: len(x), seq)) for seq in sequences])
        max_length_word = cfg.MAX_LENGTH_WORD
        sequence_padded, sequence_length = [], []
        for seq in sequences:
            # Pad every word of the sentence to max_length_word.
            sp, sl = add_pad(seq, pad_token, max_length_word)
            sequence_padded += [sp]
            sequence_length += [sl]
        # Then pad every sentence to the length of the longest sentence in the batch.
        max_length_sentence = max(map(lambda x: len(x), sequences))
        sequence_padded, _ = add_pad(sequence_padded, [pad_token] * max_length_word, max_length_sentence)
        sequence_length, _ = add_pad(sequence_length, 0, max_length_sentence)
    return sequence_padded, sequence_length
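
# Illustrative usage (a sketch, not part of the original module; the token ids are made up):
#
#   padded, lengths = pad_sequences([[1, 2, 3], [4, 5], [6]], 0, 'sentences')
#   # padded  -> [[1, 2, 3], [4, 5, 0], [6, 0, 0]]
#   # lengths -> [3, 2, 1]
#
# With type='words', each sentence is a list of character-id lists; words are padded to
# cfg.MAX_LENGTH_WORD and sentences to the longest sentence in the batch.
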
def add_pad(sequences, pad_token, max_length):
    '''
    Truncate or pad every sequence to exactly max_length items.
    :param sequences: a list of sequences
    :param pad_token: the pad token
    :param max_length: the length every sequence is truncated or padded to
    :return: the padded sequences and their original (capped) lengths
    '''
    sequence_padded, sequence_length = [], []
    for seq in sequences:
        seq = list(seq)
        # Keep at most max_length items and fill the remainder with pad_token.
        seq_ = seq[:max_length] + [pad_token] * max(max_length - len(seq), 0)
        sequence_padded += [seq_]
        sequence_length += [min(len(seq), max_length)]
    return sequence_padded, sequence_length
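
# Illustrative behaviour (a sketch, not part of the original module): sequences longer than
# max_length are truncated, shorter ones are padded, and the reported length is capped.
#
#   padded, lengths = add_pad([[1, 2, 3, 4, 5], [6]], 0, 3)
#   # padded  -> [[1, 2, 3], [6, 0, 0]]
#   # lengths -> [3, 1]
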
def batch_gen(data, minibatch_size):
    """
    Args:
        data: generator of (sentence, tags) tuples
        minibatch_size: (int)
    Yields:
        (x_batch, y_batch) tuples of lists with at most minibatch_size items each
    """
    x_batch, y_batch = [], []
    for (x, y) in data:
        if len(x_batch) == minibatch_size:
            yield x_batch, y_batch
            x_batch, y_batch = [], []
        if isinstance(x[0], tuple):
            # If each word is a tuple (e.g. character ids plus a word id), transpose the
            # sentence into parallel sequences; list() so it can be iterated more than
            # once under Python 3, where zip() returns a lazy iterator.
            x = list(zip(*x))
        x_batch += [x]
        y_batch += [y]
    if len(x_batch) != 0:
        yield x_batch, y_batch
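
# Illustrative usage (a sketch, not part of the original module; the ids are made up):
#
#   data = [([1, 2], [0, 0]), ([3], [1]), ([4, 5, 6], [0, 1, 0])]
#   for x_batch, y_batch in batch_gen(data, 2):
#       print(x_batch, y_batch)
#   # -> [[1, 2], [3]]   [[0, 0], [1]]
#   # -> [[4, 5, 6]]     [[0, 1, 0]]
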
def get_chunk_type(tok, idx_to_tag):
    """
    Args:
        tok: id of token, ex 4
        idx_to_tag: dictionary {4: "B-PER", ...}
    Returns:
        tuple: ("B", "PER")
    """
    tag_name = idx_to_tag[tok]
    tag_class = tag_name.split('-')[0]
    tag_type = tag_name.split('-')[-1]
    return tag_class, tag_type
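
# Illustrative usage (a sketch, not part of the original module):
#
#   get_chunk_type(4, {4: "B-PER", 5: "I-PER"})   # -> ("B", "PER")
#   get_chunk_type(5, {4: "B-PER", 5: "I-PER"})   # -> ("I", "PER")
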
def get_chunks(seq, tags):
    """
    Args:
        seq: [4, 4, 0, 0, ...] sequence of labels
        tags: dictionary mapping tag names to label ids, e.g. tags["O"] = 0
    Returns:
        list of (chunk_type, chunk_start, chunk_end)
    Example (assuming cfg.NONE is "O"):
        seq = [4, 5, 0, 3]
        tags = {"B-PER": 4, "I-PER": 5, "B-LOC": 3, "O": 0}
        result = [("PER", 0, 2), ("LOC", 3, 4)]
    """
    default = tags[cfg.NONE]
    idx_to_tag = {idx: tag for tag, idx in tags.items()}
    chunks = []
    chunk_type, chunk_start = None, None
    for i, tok in enumerate(seq):
        # End of a chunk: the 'outside' label closes any open chunk.
        if tok == default and chunk_type is not None:
            chunk = (chunk_type, chunk_start, i)
            chunks.append(chunk)
            chunk_type, chunk_start = None, None
        # Start of a chunk, possibly closing the previous one.
        elif tok != default:
            tok_chunk_class, tok_chunk_type = get_chunk_type(tok, idx_to_tag)
            if chunk_type is None:
                chunk_type, chunk_start = tok_chunk_type, i
            elif tok_chunk_type != chunk_type or tok_chunk_class == "B":
                chunk = (chunk_type, chunk_start, i)
                chunks.append(chunk)
                chunk_type, chunk_start = tok_chunk_type, i
            else:
                # Same chunk continues.
                pass
    # Close a chunk that runs to the end of the sequence.
    if chunk_type is not None:
        chunk = (chunk_type, chunk_start, len(seq))
        chunks.append(chunk)
    return chunks
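
# Illustrative usage (a sketch, not part of the original module; assumes cfg.NONE == "O"):
#
#   tags = {"B-PER": 4, "I-PER": 5, "B-LOC": 3, "O": 0}
#   get_chunks([3, 0, 4, 5, 5], tags)   # -> [("LOC", 0, 1), ("PER", 2, 5)]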