/*
Data Analyst Job Postings
SQL Author: Jaime M. Shaker
Dataset Creator: Luke Barousse (https://www.linkedin.com/in/luke-b/) (https://www.youtube.com/@LukeBarousse)
Dataset Location: https://www.kaggle.com/datasets/lukebarousse/data-analyst-job-postings-google-search
Email: jaime.m.shaker@gmail.com or jaime@shaker.dev
Website: https://www.shaker.dev
LinkedIn: https://www.linkedin.com/in/jaime-shaker/
File Name: build_tables.sql
	Description: This script imports data from the CSV files and creates the
	schema, tables, and table relationships for this project. Once complete,
	it drops any unnecessary schemas and tables.
*/
-- Create a schema and table for importing data
DROP SCHEMA IF EXISTS import_data CASCADE;
CREATE SCHEMA IF NOT EXISTS import_data;
/*
 * GitHub only allows files smaller than 100 MB, and this dataset was over 130 MB as of 2023-09-09.
 * I split the CSV files into standard calendar quarters (Q1, Q2, Q3, Q4) for those who wish to use the CSV files from my repo.
 */
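/*
 * For reference, a quarterly split like this could be reproduced in PostgreSQL.
 * A sketch only: the table name full_jobs and the output path are assumptions,
 * not part of the original pipeline, so the statement is left commented out.
 *
 * COPY (
 *     SELECT * FROM full_jobs
 *     WHERE date_time >= '2023-01-01' AND date_time < '2023-04-01'
 * ) TO '/var/lib/postgresql/source_data/csv/gsearch_jobs_2023_q1.csv'
 * WITH DELIMITER ',' HEADER CSV;
 */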
-- Import data from CSV files
-- Create Data Analyst Jobs Table
DROP TABLE IF EXISTS import_data.jobs;
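-- All columns are TEXT so the raw CSV rows load without cast errors;
-- proper types are applied later when the data moves into data_analyst.jobs.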
CREATE TABLE import_data.jobs (
data_job_id TEXT,
idx TEXT,
title TEXT,
company_name TEXT,
job_location TEXT,
via TEXT,
description TEXT,
extensions TEXT,
job_id TEXT,
thumbnail TEXT,
posted_at TEXT,
schedule_type TEXT,
work_from_home TEXT,
salary TEXT,
search_term TEXT,
date_time TEXT,
search_location TEXT,
commute_time TEXT,
salary_pay TEXT,
salary_rate TEXT,
salary_avg TEXT,
salary_min TEXT,
salary_max TEXT,
salary_hourly TEXT,
salary_yearly TEXT,
salary_standardized TEXT,
description_tokens TEXT,
PRIMARY KEY (data_job_id)
);
-- gsearch_jobs 2022 CSV file.
COPY import_data.jobs (
data_job_id,
idx,
title,
company_name,
job_location,
via,
description,
extensions,
job_id,
thumbnail,
posted_at,
schedule_type,
work_from_home,
salary,
search_term,
date_time,
search_location,
commute_time,
salary_pay,
salary_rate,
salary_avg,
salary_min,
salary_max,
salary_hourly,
salary_yearly,
salary_standardized,
description_tokens
)
FROM '/var/lib/postgresql/source_data/csv/gsearch_jobs_2022.csv'
WITH DELIMITER ',' HEADER CSV;
-- gsearch_jobs_2023_q1 CSV file.
COPY import_data.jobs (
data_job_id,
idx,
title,
company_name,
job_location,
via,
description,
extensions,
job_id,
thumbnail,
posted_at,
schedule_type,
work_from_home,
salary,
search_term,
date_time,
search_location,
commute_time,
salary_pay,
salary_rate,
salary_avg,
salary_min,
salary_max,
salary_hourly,
salary_yearly,
salary_standardized,
description_tokens
)
FROM '/var/lib/postgresql/source_data/csv/gsearch_jobs_2023_q1.csv'
WITH DELIMITER ',' HEADER CSV;
-- gsearch_jobs_2023_q2 CSV file.
COPY import_data.jobs (
data_job_id,
idx,
title,
company_name,
job_location,
via,
description,
extensions,
job_id,
thumbnail,
posted_at,
schedule_type,
work_from_home,
salary,
search_term,
date_time,
search_location,
commute_time,
salary_pay,
salary_rate,
salary_avg,
salary_min,
salary_max,
salary_hourly,
salary_yearly,
salary_standardized,
description_tokens
)
FROM '/var/lib/postgresql/source_data/csv/gsearch_jobs_2023_q2.csv'
WITH DELIMITER ',' HEADER CSV;
-- gsearch_jobs_2023_q3 CSV file.
COPY import_data.jobs (
data_job_id,
idx,
title,
company_name,
job_location,
via,
description,
extensions,
job_id,
thumbnail,
posted_at,
schedule_type,
work_from_home,
salary,
search_term,
date_time,
search_location,
commute_time,
salary_pay,
salary_rate,
salary_avg,
salary_min,
salary_max,
salary_hourly,
salary_yearly,
salary_standardized,
description_tokens
)
FROM '/var/lib/postgresql/source_data/csv/gsearch_jobs_2023_q3.csv'
WITH DELIMITER ',' HEADER CSV;
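-- Optional sanity check (a sketch, not part of the original pipeline):
-- confirm that all four CSV files loaded and inspect a few raw rows.
SELECT count(*) AS imported_rows FROM import_data.jobs;
SELECT data_job_id, title, company_name, date_time
FROM import_data.jobs
LIMIT 5;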
-- Create working table with 'cleaned' data.
DROP SCHEMA IF EXISTS data_analyst CASCADE;
CREATE SCHEMA IF NOT EXISTS data_analyst;
-- Create Data Analyst Jobs Table with properly typed columns
DROP TABLE IF EXISTS data_analyst.jobs;
CREATE TABLE data_analyst.jobs (
	data_job_id int,
idx int NOT NULL,
title TEXT NOT NULL,
company_name TEXT NOT NULL,
job_location TEXT,
via TEXT,
description TEXT,
extensions TEXT [],
job_id TEXT,
thumbnail TEXT,
posted_at TEXT,
schedule_type TEXT,
work_from_home boolean,
salary TEXT,
search_term TEXT,
date_time timestamp NOT NULL,
search_location TEXT,
commute_time TEXT,
salary_pay TEXT,
salary_rate TEXT,
salary_avg NUMERIC,
salary_min NUMERIC,
salary_max NUMERIC,
salary_hourly NUMERIC,
salary_yearly NUMERIC,
salary_standardized NUMERIC,
description_tokens TEXT [],
PRIMARY KEY (data_job_id)
);
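-- Copy the raw rows into the typed table, casting and cleaning along the way.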
INSERT INTO data_analyst.jobs (
data_job_id,
idx,
title,
company_name,
job_location,
via,
description,
extensions,
job_id,
thumbnail,
posted_at,
schedule_type,
work_from_home,
salary,
search_term,
date_time,
search_location,
commute_time,
salary_pay,
salary_rate,
salary_avg,
salary_min,
salary_max,
salary_hourly,
salary_yearly,
salary_standardized,
description_tokens
) (
SELECT
data_job_id::int,
idx::int,
lower(trim(title)) AS title,
lower(trim(company_name)) AS company_name,
lower(trim(job_location)) AS job_location,
-- Remove 'via ' from column
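	-- (illustrative: a raw value like 'via LinkedIn' would become 'linkedin')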
lower(trim(RIGHT(via, length(via) - 4))) AS via,
	-- Collapse runs of newline characters into a single carriage return (chr(13))
trim(regexp_replace(description, E'[\\n]+', chr(13), 'g' )) AS description,
-- Format to a proper PostgreSQL Array
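	-- (illustrative: a raw value like "['Full-time', 'Health insurance']" would
	-- become the array {Full-time,Health insurance}; the raw values shown are assumptions)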
string_to_array(REGEXP_REPLACE(extensions, '[\[\]'']', '', 'g'), ', ') AS extensions,
job_id,
thumbnail,
posted_at,
schedule_type,
upper(work_from_home)::boolean AS work_from_home,
salary,
search_term,
date_time::timestamp,
search_location,
commute_time,
salary_pay,
salary_rate,
salary_avg::NUMERIC,
salary_min::NUMERIC,
salary_max::NUMERIC,
salary_hourly::NUMERIC,
salary_yearly::NUMERIC,
salary_standardized::NUMERIC,
-- Format to a proper PostgreSQL Array
string_to_array(REGEXP_REPLACE(description_tokens, '[\[\]'']', '', 'g'), ', ') AS description_tokens
FROM
import_data.jobs
);
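-- Optional sanity check (a sketch, not part of the original pipeline): the typed
-- table should hold exactly as many rows as the staging table.
SELECT
	(SELECT count(*) FROM import_data.jobs) AS staged_rows,
	(SELECT count(*) FROM data_analyst.jobs) AS typed_rows;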
-- Remove the posted_at value from the extensions column
UPDATE data_analyst.jobs
SET
extensions = array_remove(extensions, posted_at);
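-- Optional spot check (a sketch, not part of the original pipeline): no row
-- should still carry its posted_at value inside its extensions array.
SELECT count(*) AS leftover_rows
FROM data_analyst.jobs
WHERE posted_at = ANY (extensions);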
-- Drop import schema and table
DROP TABLE IF EXISTS import_data.jobs;
DROP SCHEMA IF EXISTS import_data CASCADE;