-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathutils.py
85 lines (68 loc) · 2.94 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import torch
import clip
# Run on the first CUDA GPU when one is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    # Make the selected GPU the current CUDA device for this process.
    torch.cuda.set_device(device)

# Load CLIP 'ViT-B/32' (non-JIT) onto the chosen device once, at import time.
clip_model, preprocess = clip.load('ViT-B/32', device, jit=False)
def chunking_sentence(sentence, use_template=False, obj_part=False):
    """Return word-index spans of the noun phrases found in ``sentence``.

    The text is tokenized and POS-tagged with NLTK, then chunked with two
    regular-expression grammars: plain noun phrases (runs of determiners,
    adjectives and nouns) and noun phrases joined by a preposition
    (e.g. "a cat on a mat"). Every resulting phrase is located in the
    original sentence and reported as positions in ``sentence.split(' ')``.

    Args:
        sentence: Text to analyse. Word indices are computed by splitting on
            single spaces.
        use_template: When true, ``sentence`` is expected to follow the
            template "A 3D rendering of xxx in unreal engine." and only the
            ``xxx`` part (words ``[4:-3]``) is chunked. The returned indices
            still refer to the full sentence.
        obj_part: When true, return only the plain noun phrases and skip the
            merge with the prepositional noun phrases.

    Returns:
        A list of ``(start, end)`` word-index tuples, ``end`` exclusive, one
        per phrase. Repeated phrases map to successive occurrences.

    Raises:
        ValueError: If a phrase (tokens re-joined with single spaces) cannot
            be found in the text, e.g. because of irregular whitespace.
    """
    import re

    from nltk import word_tokenize, pos_tag, RegexpParser

    def collect_phrases(grammar, tagged_words):
        """Chunk ``tagged_words`` with ``grammar``; return each NP as text."""
        tree = RegexpParser(grammar).parse(tagged_words)
        return [
            ' '.join(word for word, _tag in subtree.leaves())
            for subtree in tree.subtrees(filter=lambda t: t.label() == 'NP')
        ]

    def merge_phrases(text, simple_phrases, extended_phrases):
        """Merge both phrase lists, dropping simple phrases already covered."""
        if not extended_phrases:
            return simple_phrases
        # Pad with spaces so containment is tested on whole words: "a cat"
        # must not be considered part of "a catalog of books".
        padded = [' ' + phrase + ' ' for phrase in extended_phrases]
        uncovered = [
            phrase for phrase in simple_phrases
            if not any(' ' + phrase + ' ' in ext for ext in padded)
        ]
        # Restore reading order by first occurrence in the chunked text.
        return sorted(extended_phrases + uncovered, key=text.index)

    def extract_noun_phrases(text):
        """Return the noun phrases of ``text`` in reading order."""
        # Tokenize and POS-tag once; both grammars reuse the result.
        tagged_words = pos_tag(word_tokenize(text))
        # Plain noun phrases: runs of determiners, adjectives and nouns.
        simple_phrases = collect_phrases(
            r"""
            NP: {<DT|JJ|NN.*>+}
            """,
            tagged_words,
        )
        if obj_part:
            return simple_phrases
        # Noun phrases followed by a prepositional phrase.
        extended_phrases = collect_phrases(
            r"""
            NP: {<DT|JJ|NN.*>+<IN><DT|JJ|NN.*>+}
            """,
            tagged_words,
        )
        return merge_phrases(text, simple_phrases, extended_phrases)

    def locate(text, search_from):
        """Return the character offset of ``text`` inside ``sentence``."""
        # Only accept matches that start and end on word boundaries, so a
        # phrase is never found inside a longer word.
        bounded = re.compile(r'(?<!\w)' + re.escape(text) + r'(?!\w)')
        match = bounded.search(sentence, search_from) or bounded.search(sentence)
        if match is not None:
            return match.start()
        # Last resort: plain substring search (raises ValueError if absent).
        return sentence.index(text)

    words = sentence.split(' ')
    if use_template:
        chunk_text = ' '.join(words[4:-3])
        # Skip the four template words ("A 3D rendering of") when searching,
        # so phrases are never matched against the template prefix.
        search_floor = len(' '.join(words[:4])) + 1 if len(words) > 4 else 0
    else:
        chunk_text = sentence
        search_floor = 0

    next_search = {}  # phrase text -> offset just past its last match
    spans = []
    for phrase in extract_noun_phrases(chunk_text):
        pos = locate(phrase, next_search.get(phrase, search_floor))
        next_search[phrase] = pos + len(phrase)
        # The number of spaces before the match is the index of its first word.
        first_word = sentence[:pos].count(' ')
        spans.append((first_word, first_word + len(phrase.split(' '))))
    return spans