-
Notifications
You must be signed in to change notification settings - Fork 0
/
graphDataProcessor.py
136 lines (116 loc) · 4.45 KB
/
graphDataProcessor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
from transformers import BertTokenizer
import re
import torch
# Load the BERT tokenizer (uncased vocab; do_lower_case matches the checkpoint)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
# Maximum token length every encoded sequence is truncated/padded to.
MAX_LEN = 20
# Create a function to tokenize a set of texts
def preprocessing_for_bert(data):
    """Perform required preprocessing steps for pretrained BERT.

    Each text is cleaned with ``text_preprocessing`` and encoded to exactly
    ``MAX_LEN`` tokens so the per-sentence id lists are rectangular and can
    be stacked into tensors.

    @param data (iterable of str): Texts to be processed.
    @return input_ids (torch.Tensor): Tensor of token ids to be fed to a model.
    @return attention_masks (torch.Tensor): Tensor of indices specifying which
                  tokens should be attended to by the model.
    """
    input_ids = []
    attention_masks = []
    for sent in data:
        # `encode_plus` will:
        #   (1) tokenize, (2) add [CLS]/[SEP], (3) truncate/pad to MAX_LEN,
        #   (4) map tokens to ids, (5) build the attention mask.
        # BUG FIX: the original passed `text_pair=b[i]` where neither `b` nor
        # `i` exists, raising NameError on every call; the docstring describes
        # single-text encoding, so the pair argument is dropped.
        encoded_sent = tokenizer.encode_plus(
            text=text_preprocessing(sent),      # preprocess sentence
            add_special_tokens=True,            # add `[CLS]` and `[SEP]`
            max_length=MAX_LEN,                 # max length to truncate/pad
            padding='max_length',               # pad to MAX_LEN so rows are equal-length
            truncation=True,                    # replaces deprecated truncation_strategy
            return_attention_mask=True,         # return attention mask
        )
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))
    # Convert the rectangular lists to tensors.
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)
    return input_ids, attention_masks
def text_preprocessing(text):
    """Clean a tweet-like string before tokenization.

    - Remove entity mentions (eg. '@united')
    - Decode the HTML-escaped ampersand ('&amp;' -> '&')
    - Collapse runs of whitespace and strip the ends.

    @param text (str): a string to be processed.
    @return text (str): the processed string.
    """
    # Remove '@name' mentions (non-greedy up to the following whitespace).
    text = re.sub(r'(@.*?)[\s]', ' ', text)
    # BUG FIX: the original substituted '&' with '&' (a no-op); per the
    # docstring intent this decodes the HTML entity '&amp;' back to '&'.
    text = re.sub(r'&amp;', '&', text)
    # Collapse internal whitespace and remove leading/trailing whitespace.
    text = re.sub(r'\s+', ' ', text).strip()
    return text
def preprocessing_for_bert_single(tweet_id):
    """Encode one tweet's source text paired with all of its replies.

    Looks up the row whose 'tweet_id' matches, concatenates its 'replies'
    into one string, and encodes (source, replies) as a BERT text pair of
    exactly MAX_LEN tokens.

    NOTE(review): reads a module-level DataFrame ``df`` that is not defined
    in this file — confirm it is created before this function is called.

    @param tweet_id: identifier matched against df['tweet_id'].
    @return input_ids (torch.Tensor): shape (1, MAX_LEN) token ids.
    @return attention_masks (torch.Tensor): shape (1, MAX_LEN) mask.
    """
    target_df = df.loc[df['tweet_id'] == tweet_id]
    sent = target_df['source'].iloc[0]
    comments = target_df['replies'].iloc[0]
    # O(n) join instead of the original quadratic `+=` concatenation loop.
    comments_str = ''.join(comments)
    encoded_sent = tokenizer.encode_plus(
        text=text_preprocessing(sent),              # preprocess source tweet
        text_pair=text_preprocessing(comments_str), # all replies as one string
        add_special_tokens=True,                    # add `[CLS]` and `[SEP]`
        max_length=MAX_LEN,                         # max length to truncate/pad
        padding='max_length',                       # pad to MAX_LEN (padding=True is a no-op here)
        truncation='longest_first',                 # replaces deprecated truncation_strategy
        return_attention_mask=True,                 # return attention mask
    )
    # Wrap the single example in a batch dimension of 1.
    input_ids = torch.tensor([encoded_sent.get('input_ids')])
    attention_masks = torch.tensor([encoded_sent.get('attention_mask')])
    return input_ids, attention_masks
def preprocessing_for_bert_latest(root_node, node_content):
    """Encode the root text paired with every node (root included).

    For each candidate text (the root's own entries followed by
    ``node_content``), encodes (root_node[0], candidate) as a BERT text
    pair of exactly MAX_LEN tokens.

    NOTE(review): texts are passed to the tokenizer raw — unlike the other
    helpers, ``text_preprocessing`` is not applied here; confirm that is
    intentional.

    @param root_node: array-like of str; root_node[0] is the anchor text
                      (assumed np.ndarray — it must support .tolist()).
    @param node_content: iterable of str candidate texts.
    @return input_ids (torch.Tensor): shape (len(texts), MAX_LEN).
    @return attention_masks (torch.Tensor): shape (len(texts), MAX_LEN).
    """
    # list() replaces the manual append loop; the original also assigned
    # whole_lst = [] only to overwrite it immediately (dead store).
    texts = root_node.tolist() + list(node_content)
    input_ids = []
    attention_masks = []
    root_text = root_node[0]  # hoist the loop-invariant anchor lookup
    for candidate in texts:
        encoded_sent = tokenizer.encode_plus(
            text=root_text,                 # anchor text (not preprocessed — see NOTE)
            text_pair=candidate,            # candidate node text
            add_special_tokens=True,        # add `[CLS]` and `[SEP]`
            max_length=MAX_LEN,             # max length to truncate/pad
            padding='max_length',           # pad to MAX_LEN so rows are equal-length
            truncation='longest_first',     # replaces deprecated truncation_strategy
            return_attention_mask=True,     # return attention mask
        )
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))
    # Convert the rectangular lists to tensors.
    return torch.tensor(input_ids), torch.tensor(attention_masks)