# extract_terminology_mlteacter.py
import json

# torch and transformers for the model and tokenization
import torch
from transformers import AutoTokenizer
from transformers import XLMRobertaForSequenceClassification

max_len = 512

def extract_terms(validation_df, xlmr_model, xlmr_tokenizer, device="cpu"):
    print(len(validation_df))
    term_list = []
    # Assumed class order for the binary classifier: index 1 = "Term".
    label_names = ["Not a term", "Term"]
    # put model in evaluation mode
    xlmr_model.eval()
    for index, row in validation_df.iterrows():
        sentence = row['n_gram'] + ". " + row["Context"]
        encoded_dict = xlmr_tokenizer.encode_plus(sentence,
                                                  max_length=max_len,
                                                  padding='max_length',
                                                  truncation=True,
                                                  return_tensors='pt')
        input_id = encoded_dict['input_ids'].to(device)
        attn_mask = encoded_dict['attention_mask'].to(device)
        # dummy label: only the logits are used, the returned loss is ignored
        label = torch.tensor(0).to(device)
        with torch.no_grad():
            output = xlmr_model(input_id,
                                token_type_ids=None,
                                attention_mask=attn_mask,
                                labels=label)
        logits = output.logits.detach().cpu().numpy()
        pred = label_names[logits[0].argmax(axis=0)]
        if pred == "Term":
            term_list.append(row['n_gram'])
    return set(term_list)
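
# Illustrative usage sketch (not part of the original script): `validation_df`
# is assumed to be a pandas DataFrame with at least 'n_gram' and 'Context'
# columns; the example row below is invented for illustration only.
#
#   import pandas as pd
#   candidates = pd.DataFrame({
#       "n_gram": ["arm swing"],
#       "Context": ["The subject walks forward swinging both arms."],
#   })
#   terms = extract_terms(candidates, xlmr_model, xlmr_tokenizer)
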
full_metadata = {}
with open("motion_data_full.json") as fp:
    full_metadata.update(json.load(fp))

cmu_taxonomy = full_metadata['cmu_description_taxonomy']
del full_metadata['cmu_description_taxonomy']
kit_taxonomy = full_metadata['kit_description_taxonomy']
del full_metadata['kit_description_taxonomy']
corpus = []
for key in full_metadata.keys():
    annotations = []
    with open(f"data/{str(key).zfill(5)}_annotations.json", "r") as fp_annot:
        annotations.extend(json.load(fp_annot))
    record = full_metadata[key]
    source = record['metadata']['source']['database']['identifier']
    additional_text = ""
    if source == "cmu":
        # additional_text = record['description']
        pass
    elif source == "kit":
        # additional_text += record['comment']
        pass
    for annotation in annotations:
        if annotation.endswith("."):
            annotation = annotation[:-1]
        corpus.append(annotation)
        # corpus += f"{additional_text}\n"
xlmr_tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
# NOTE: the tensors built below are prepared but not used further in this
# script; extract_terms re-encodes its own inputs.
labels_ = []
input_ids_ = []
attn_masks_ = []
# for each data sample:
for sentence in corpus:
    # create the required inputs, i.e. token ids and attention masks
    encoded_dict = xlmr_tokenizer.encode_plus(sentence,
                                              max_length=512,
                                              padding='max_length',
                                              truncation=True,
                                              return_tensors='pt')
    # add encoded sample to lists
    input_ids_.append(encoded_dict['input_ids'])
    attn_masks_.append(encoded_dict['attention_mask'])
    # no gold labels are available for the raw corpus, so use a placeholder
    labels_.append(0)
# Convert each Python list of Tensors into a 2D Tensor matrix.
input_ids_ = torch.cat(input_ids_, dim=0)
attn_masks_ = torch.cat(attn_masks_, dim=0)
# labels to tensor
labels_ = torch.tensor(labels_)
checkpoint = torch.load('checkpoints/checkpoint_3.pth.tar', map_location="cpu")
xlmr_model = XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=2)
xlmr_model.load_state_dict(checkpoint['model_state_dict'])

# `train_data_lombalgie` is expected to be a pandas DataFrame with 'n_gram' and
# 'Context' columns; it is not defined in this script and must be provided
# elsewhere.
extracted_terms = extract_terms(train_data_lombalgie, xlmr_model, xlmr_tokenizer)
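
# Minimal follow-up sketch (an addition, not in the original script): persist
# the extracted terms for later inspection. The output path is an assumption.
with open("extracted_terms.json", "w") as fp_out:
    json.dump(sorted(extracted_terms), fp_out, ensure_ascii=False, indent=2)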