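"""Entry point of the competency question (CQ) generation pipeline.

For each selected theme, the script loads items from the WDV dataset and runs
verbalization abstraction (VA), question generation (QG), generalization (GP),
question mapping against the BigCQ dataset (QM), and finally question reduction
(clustering and paraphrase detection) over the generated questions.
"""
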
import json
import os

import ijson
import pandas as pd
from sentence_transformers import SentenceTransformer

from Scripts.verbalizationAbstraction import VerbalizationAbstaction as VA
from Scripts.questionGeneration import questionGeneration as QG
from Scripts.pipeline import generlizationPipeline as GP
from Scripts.questionMapping import questionMapping as QM
from Scripts.questionReduction import CQClustering, ParaphraseDetection

def readingJson(Path, theme_label):
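    """Stream the JSON file at `Path` with ijson and return the de-duplicated items whose 'theme_label' matches `theme_label`."""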
    with open(Path, 'r') as f:
        parser = ijson.items(f, 'item')
        # Collect the items whose 'theme_label' matches the requested theme, skipping duplicates
        items = []
        for item in parser:
            if item.get('theme_label') == theme_label:
                if item not in items:
                    items.append(item)
    return items

def simpleInterface():
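    """Prompt for a theme_label (or '0' for all themes) and a reading limit; return (theme_labels, readingLimit)."""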
    theme = {
        '1': 'Airport',
        '2': 'Artist',
        '3': 'Astronaut',
        '4': 'Athlete',
        '5': 'Building',
        '6': 'CelestialBody',
        '7': 'ChemicalCompound',
        '8': 'City',
        '9': 'ComicsCharacter',
        '10': 'Food',
        '11': 'MeanOfTransportation',
        '12': 'Monument',
        '13': 'Mountain',
        '14': 'Painting',
        '15': 'Politician',
        '16': 'SportsTeam',
        '17': 'Street',
        '18': 'Taxon',
        '19': 'University',
        '20': 'WrittenWork'
    }
    print('Please select a theme_label from the list below (type "0" to process all themes):')
    for key, value in theme.items():
        print(key, value)
    while True:
        theme_input = input()
        if theme_input in theme.keys() or theme_input == '0':
            break
        else:
            print('Please select a theme_label from the list')
    if theme_input == '0':
        theme_labels = list(theme.values())  # '0' selects every theme
    else:
        theme_labels = [theme[theme_input]]
    print('Please input a reading limit. If you want to test all data, then type "0"')
    while True:
        readingLimit_input = input()
        if readingLimit_input.isdigit():  # accept only non-negative integers
            break
        else:
            print('Please input an integer of 0 or greater')
    return theme_labels, int(readingLimit_input)

def listingQuestions(inputdata_path):
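    """Flatten the property/object CQs (and their generalized forms) from `inputdata_path` into Data/Temp/questions.txt, one question per line."""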
    with open(inputdata_path, 'r') as file:
        data = json.load(file)
    # Open the output text file for writing
    with open('Data/Temp/questions.txt', 'w') as file:
        # Iterate over each item in the JSON data
        for item in data:
            # Extract the questions and write them to the file
            questions = [
                item.get('propertyCQ'),
                item.get('objectCQ'),
                item.get('generalizedPropertyCQ'),
                item.get('generalizedObjectCQ')
            ]
            # Write each question to the file on a new line without double quotations
            for question in questions:
                if question:  # skip questions that are None
                    file.write(question + '\n')

if __name__ == "__main__":
    directory_path = 'Data/Temp'
    if not os.path.exists(directory_path):
        os.makedirs(directory_path)
    # Select the theme(s) and reading limit, then process the raw data for each theme
    theme_labels, readingLimit = simpleInterface()
    for theme_label in theme_labels:
        rawData = readingJson('Data/WDV_dataset.json', theme_label)
        #### PROCESS: VA -> QG -> GP -> QM -> question reduction ####
        # 1. Run verbalizationAbstraction.py with the parsed data for the selected theme_label
        if readingLimit == 0:
            Pruned_rawData = rawData
        else:
            Pruned_rawData = rawData[:readingLimit]
        VA(Pruned_rawData, theme_label)  # saves the results to "Data/Temp/verbalizationAbstraction-{theme_label}.json"
        # 2. Run questionGeneration.py with the abstracted verbalizations
        verbalData = readingJson(f'Data/Temp/verbalizationAbstraction-{theme_label}.json', theme_label)
        QG(verbalData, theme_label)  # saves the results to "Data/Temp/questionGeneration-{theme_label}.json"
        # 3. Run pipeline.py with the generated questions
        questionData = readingJson(f'Data/Temp/questionGeneration-{theme_label}.json', theme_label)
        GP(questionData, theme_label)  # generalization of the questions generated above
        # 4. Run questionMapping.py with the BigCQ dataset
        generalizedQuestions = readingJson(f"Data/Temp/generalizedQuestion-{theme_label}.json", theme_label)
        QM(generalizedQuestions, theme_label)  # map the generalized questions against BigCQ
        # 5. Run questionReduction.py with the generated questions
        listingQuestions(f"Data/Temp/generalizedQuestion-{theme_label}.json")
        # Candidate sentence-embedding models
        model_st1 = SentenceTransformer('all-mpnet-base-v2')
        model_st2 = SentenceTransformer('all-MiniLM-L6-v2')
        model_st3 = SentenceTransformer('paraphrase-mpnet-base-v2')
        model_st4 = SentenceTransformer('paraphrase-MiniLM-L3-v2')
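        # Note: only model_st2 is used below; the other candidates can be swapped in to
        # compare clustering and paraphrase-detection quality.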
        with open('Data/Temp/questions.txt') as f:
            questions = [line.strip() for line in f if line.strip()]  # drop newlines and blank lines
        clustering_results = CQClustering(questions, model_st2)
        print(clustering_results)
        qqp_data = pd.read_csv("Data/qqp.tsv", sep="\t")
        reading_limits = 5000  # cap the number of QQP rows to keep processing manageable
        qqp_data = qqp_data[:reading_limits]
        ParaphraseDetection(qqp_data, model_st2)