"""
This file converts the dataset sentences to my format to be used for
language modelling with GPT instead of BERT models.
# SParC: Cross-Domain Semantic Parsing in Context
=================================================
Each file in train.json and dev.json contains the following fields:
```
question: the natural language question
question_toks: the natural language question tokens
database_id: the database id to which this interaction is addressed.
interaction: the query interaction including multiple DB query questions.
For each question in the interaction, it includes:
utterance: the natural language question
utterance_toks: the natural language question tokens
query: the SQL query corresponding to the question.
sql: parsed results of this SQL query using process_sql.py. Please refer to
the Spider Github page for the detailed documentation.
final: the final interaction query goal
utterance: the natural language question of the final interaction goal
query: the SQL query corresponding to the final interaction goal.
```
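For illustration only (the values below are made up, not copied from the dataset),
an entry has roughly this shape; the script below only reads database_id and the
final block:
```
{
    "database_id": "city_db",
    "interaction": [
        {"utterance": "How many cities are there?",
         "utterance_toks": ["How", "many", "cities", "are", "there", "?"],
         "query": "SELECT COUNT(*) FROM city",
         "sql": {...}}
    ],
    "final": {"utterance": "How many cities are there?",
              "query": "SELECT COUNT(*) FROM city"}
}
```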
# Spider: A Large-Scale Human-Labeled Dataset for Complex and
Cross-Domain Semantic Parsing and Text-to-SQL Task
==================================================
Each file in train.json and dev.json contains the following fields:
```
question: the natural language question
question_toks: the natural language question tokens
db_id: the database id to which this question is addressed.
query: the SQL query corresponding to the question.
query_toks: the SQL query tokens corresponding to the question.
sql: parsed results of this SQL query using process_sql.py. Please refer to
parsed_sql_examples.sql in the preprocess directory for the detailed documentation.
```
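For illustration only (made-up values), an entry looks roughly like:
```
{
    "db_id": "city_db",
    "question": "How many cities are there?",
    "question_toks": ["How", "many", "cities", "are", "there", "?"],
    "query": "SELECT COUNT(*) FROM city",
    "query_toks": ["SELECT", "COUNT", "(", "*", ")", "FROM", "city"],
    "sql": {...}
}
```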
# Tables
========
tables.json contains the following information for each database:
```
db_id: database id
table_names_original: original table names stored in the database.
table_names: cleaned and normalized table names. We make sure the
table names are meaningful. [to be changed]
column_names_original: original column names stored in the database.
Each column looks like: [0, "id"]. 0 is the index of table names in
table_names, which is city in this case. "id" is the column name.
column_names: cleaned and normalized column names. We make sure the column
names are meaningful. [to be changed]
column_types: data type of each column
foreign_keys: foreign keys in the database. [3, 8] means column indices
in the column_names. These two columns are foreign keys of two different tables.
primary_keys: primary keys in the database. Each number is the index of column_names.
```
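A minimal made-up example that follows the description above (a city table
referencing a country table):
```
{
    "db_id": "city_db",
    "table_names_original": ["city", "country"],
    "table_names": ["city", "country"],
    "column_names_original": [[0, "id"], [0, "name"], [0, "country_id"],
                              [1, "id"], [1, "name"]],
    "column_names": [[0, "id"], [0, "name"], [0, "country id"],
                     [1, "id"], [1, "name"]],
    "column_types": ["number", "text", "number", "number", "text"],
    "foreign_keys": [[2, 3]],
    "primary_keys": [0, 3]
}
```
Here foreign_keys [2, 3] links city.country_id to country.id, and primary_keys
index into column_names.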
# CoSQL: A Conversational Text-to-SQL Challenge Towards
Cross-Domain Natural Language Interfaces to Databases
=====================================================
No field documentation is provided for this one, but we can still extract [table], [NL], [QUERY] triplets (see the note below).
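Judging from how it is read below, cosql_all_info_dialogs.json maps a dialogue id
to a record that includes at least:
```
"<dialogue_id>": {"query_goal": "...", "sql": "...", "db_id": "..."}
```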
"""
import os
import json
import numpy as np
import pandas as pd
import networkx as nx  # each database schema is treated as a graph
from argparse import ArgumentParser
import sentencepiece
from text2sql.data import format_sql, get_db_graph_string, parse_db_to_networkx
args = ArgumentParser(description="This file converts the dataset"
                      " sentences to my format to be used for "
                      "language modelling with GPT instead of BERT models.")
args.add_argument("--data_folder", type=str, default="/Users/yashbonde/Desktop/AI/text2sql/data",
help="Folder with the extracted datasets")
args.add_argument("--vocab_size", type=int, default=4000, help="vocabulary size for sentence piece model")
args = args.parse_args()
# paths to main files
OTHER_FILE = os.path.join(args.data_folder, "spider/train_others.json")
SPIDER_FILE = os.path.join(args.data_folder, "spider/train_spider.json")
SPARC_FILE = os.path.join(args.data_folder, "sparc/train.json")
COSQL_FILE = os.path.join(args.data_folder, "cosql_dataset/cosql_all_info_dialogs.json")
# files containing tables info
SPIDER_TABLES = os.path.join(args.data_folder, "spider/tables.json")
SPARC_TABLES = os.path.join(args.data_folder, "sparc/tables.json")
COSQL_TABLES = os.path.join(args.data_folder, "cosql_dataset/tables.json")
# spider dataset already has sql files that we can read from to tokenize
SPIDER_SQL_TRAIN = os.path.join(args.data_folder, "spider/train_gold.sql")
SPIDER_SQL_DEV = os.path.join(args.data_folder, "spider/dev_gold.sql")
# dev set
SPIDER_DEV = os.path.join(args.data_folder, "spider/dev.json")
SPARC_DEV = os.path.join(args.data_folder, "sparc/dev.json")
# ---------------- CREATE PAIRS ---------------- #
data = []
dbs = []
test_train = []
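# three parallel lists: `data` holds (question, query, db_id) triplets, `dbs` records
# which source file each triplet came from, and `test_train` is 1 for training files
# and 0 for dev files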
with open(OTHER_FILE) as f1, open(SPIDER_FILE) as f2, open(SPARC_FILE) as f3,\
open(COSQL_FILE) as f4, open(SPIDER_DEV) as f5, open(SPARC_DEV) as f6:
# ========= SPIDER ========= #
# train_spider.json
for x in json.load(f2):
data.append((x["question"], x["query"], x["db_id"]))
dbs.append("train_spider")
test_train.append(1)
# spider/dev.json
for x in json.load(f5):
data.append((x["question"], x["query"], x["db_id"]))
dbs.append("test_spider")
test_train.append(0)
# train_others.json ======>>> SPIDER FOLDER
for x in json.load(f1):
data.append((x["question"], x["query"], x["db_id"]))
dbs.append("train_others")
test_train.append(1)
# ========= SPARC ========= #
# sparc/train.json
for x in json.load(f3):
data.append((x["final"]["utterance"], x["final"]["query"], x["database_id"]))
dbs.append("train_sparc")
test_train.append(1)
# sparc/dev.json
for x in json.load(f6):
data.append((x["final"]["utterance"], x["final"]["query"], x["database_id"]))
dbs.append("test_spark")
test_train.append(0)
# ========= COSQL ========= #
# cosql_all_info_dialogs.json
for x,y in json.load(f4).items():
data.append((y["query_goal"], y["sql"], y["db_id"]))
dbs.append("cosql_all")
test_train.append(1)
dataset_f = []
cols = ["question", "query", "db_id", "source", "train"]
for d, db, tt in zip(data, dbs, test_train):
# sample = (d[0], format_sql(d[1]), d[2], db)
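    # format_sql() may fail on some queries; those are logged with "))" and skipped.
    # Queries that still contain table aliases (T1/t1) after formatting are only
    # flagged with "((" but kept in the dataset.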
try:
s = format_sql(d[1])
if "T1" in s or "t1" in s:
print("((", d[1].lower())
dataset_f.append((d[0], s, d[2], db, tt))
    except Exception:
print("))", d[1])
# create dataframe
df = pd.DataFrame(data=dataset_f, columns=cols)
# train/test split by database id, not by the datasets' original author splits
all_dbs = list(set(df.db_id.values))
train_dbs = set(np.random.choice(all_dbs, size = int(0.9 * len(all_dbs)), replace = False).tolist())
test_dbs = set([x for x in all_dbs if x not in train_dbs])
assert len(train_dbs | test_dbs) == len(all_dbs)
assert len(train_dbs & test_dbs) == 0
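# mark every row as train (1) or test (0) depending on which bucket its db_id landed in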
train = [1 if db_id in train_dbs else 0 for db_id in df.db_id.values]
assert len(train) == len(df)
df.train = train
df.to_csv(os.path.join(args.data_folder, "all_questions.tsv"), sep="\t", index = False)
print(f"Save dataset at: {os.path.join(args.data_folder, 'all_questions.tsv')}")
# create a single dump of all the table schemas
all_schema = {}
with open(SPARC_TABLES, "r") as f1,\
        open(SPIDER_TABLES, "r") as f2,\
        open(COSQL_TABLES, "r") as f3:
    data = json.load(f1)  # sparc/tables.json
for x in data:
all_schema[x.pop("db_id")] = x
data = json.load(f2)
for x in data:
all_schema[x.pop("db_id")] = x
data = json.load(f3)
for x in data:
all_schema[x.pop("db_id")] = x
with open(os.path.join(args.data_folder,"all_schema.json"), "w") as f:
f.write(json.dumps(all_schema))
print(f"Found {len(all_schema)} schemas")
all_strings = []
for _id in all_schema:
all_strings.append(get_db_graph_string(parse_db_to_networkx(all_schema[_id])))
with open(os.path.join(args.data_folder, "all_sentences.txt"), "w") as f:
f.write("\n".join(df["query"].unique().tolist() +
df["question"].unique().tolist() +
all_strings))
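# train a word-level SentencePiece model over all unique queries, questions and
# schema strings; [PAD]/[BOS]/[EOS]/[UNK] are pinned to ids 1-4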
sentencepiece.SentencePieceTrainer.train(
    f'--input={os.path.join(args.data_folder, "all_sentences.txt")} '
    f'--model_prefix=m --vocab_size={args.vocab_size} --pad_id=1 --pad_piece=[PAD] '
    '--bos_id=2 --bos_piece=[BOS] --eos_id=3 --eos_piece=[EOS] '
    '--unk_id=4 --unk_piece=[UNK] --model_type=word')
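# with --model_prefix=m the trained tokenizer is written to m.model / m.vocab in the
# current working directory; it can then be loaded with e.g.
#   sp = sentencepiece.SentencePieceProcessor(model_file="m.model")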
# """
# Below this was the old language modelling method which was a bad idea due to compute
# requirements. Instead we now use a better system.
# """
# with open(args.pairs, "w") as f:
# print(f"🕰 Saving Training pairs dataset at: {args.pairs}")
# s = "question\tquery\tdb_id\n"
# for x in data:
# x = list(map(lambda s: re.sub("\s+", " ", s), x))
# s += "\t".join(x) + "\n"
# f.write(s)
# # ---------------- CREATE PAIRS (DEV) ---------------- #
# data = []
# with open(SPIDER_DEV) as f1, open(SPARC_DEV) as f2:
# # train_others.json
# for x in json.load(f1):
# data.append((x["question"], x["query"], x["db_id"]))
# # sparc/train.json
# for x in json.load(f2):
# data.append((x["final"]["utterance"], x["final"]
# ["query"], x["database_id"]))
# with open(args.dev_pairs, "w") as f:
# print(f"🕰 Saving Dev. pairs dataset at: {args.dev_pairs}")
# s = "question\tquery\tdb_id\n"
# for x in data:
# x = list(map(lambda s: re.sub("\s+", " ", s), x))
# s += "\t".join(x) + "\n"
# f.write(s)
# # ---------------- CREATE TABLES ---------------- #
# table_date = []
# with open(SPIDER_TABLES) as f1, open(SPARC_TABLES) as f2, open(COSQL_TABLES) as f3:
# table_date.extend(json.load(f1)) # spider/tables.json
# table_date.extend(json.load(f2)) # sparc/tables.json
# table_date.extend(json.load(f3)) # cosql_dataset/tables.json
# table_strings = []
# for didx, d in enumerate(table_date):
# fkeys_list = [[] for _ in range(len(d["column_names_original"]))]
# for i, col in enumerate(d["column_names_original"]):
# keys_connected_to_this_col = deepcopy(list(filter(
# lambda f: i in f, d["foreign_keys"]
# )))
# if not keys_connected_to_this_col:
# continue
# con = []
# for k in keys_connected_to_this_col:
# k = [j for j in k if j != i]
# con.append(k[0])
# fkeys_list[i].extend(con)
# primary_keys = [0 for _ in range(len(d["column_names_original"]))]
# for i in d["primary_keys"]:
# primary_keys[i] = 1
# cols = [(*x, d["column_types"][i], primary_keys[i], *fkeys_list[i])
# for i, x in enumerate(d["column_names_original"])]
# tables = list(set([x[0] for x in d["column_names_original"]]))
# agg_ = [list(filter(
# lambda x: x[0] == tid, cols
# )) for tid in tables]
# string = ""
# for x in agg_:
# s = []
# for y in x[:-1]:
# y = list(map(str, y))
# s.append("[col] " + " ".join(y[1:]))
# string += " [table] " + " ".join(s)
# s = f"{didx}\t{d['db_id']}\t{string.strip()}"
# table_strings.append(s)
# with open(args.tables, "w") as f:
# print(f"🕰 Saving tables at: {args.pairs}")
# s = "id\ttable_name\tstring\n"
# s += '\n'.join(table_strings)
# f.write(s)
# # ---------------- CREATE LM CORPUS ---------------- #
# # first get a mapping like {<db_name>: <table_string>}
# with open(args.tables) as f:
# t = [x.strip() for x in f.readlines()]
# table_strs = {}
# for item in t[1:]:
# _, db_name, table_string = item.split("\t")
# table_strs[db_name] = table_string
# # now get all the question-query pairs
# with open(args.pairs) as f:
# p = [x.strip() for x in f.readlines()]
# triplets = []
# for item in p[1:]:
# question, query, db_name = item.split("\t")
# tstr = table_strs[db_name]
# triplets.append(f"{tstr} [question] {question} [query] {query}")
# with open(args.lm_corpus, "w") as f:
# print(f"🕰 Saving LM Corpus at {args.lm_corpus}")
# f.write("\n".join(triplets))
# # make the tokenizer if needed
# if args.fresh_tokenizer:
# with open(args.tables, "r") as t, open(args.pairs, "r") as p, open(args.dev_pairs, "r") as d:
# table_strings = [x.split("\t")[-1].strip() for x in t.readlines()[1:]]
# pair_strings = []
# for x in p.readlines()[1:]:
# x = x.split("\t")[:-1]
# pair_strings.extend((x[0].strip(), x[1].strip()))
# dev_strings = []
# for x in d.readlines()[1:]:
# x = x.split("\t")[:-1]
# dev_strings.extend((x[0].strip(), x[1].strip()))
# final = table_strings + pair_strings + dev_strings
# with open(args.corpus, "w") as c:
# print(f"🕰 Saving Tokenizer Corpus at {args.corpus}")
# c.write("\n".join(final))