-
Notifications
You must be signed in to change notification settings - Fork 0
/
Hive_Data processing.txt
493 lines (411 loc) · 16.1 KB
/
Hive_Data processing.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
XML DATA PROCESSING IN HIVE:
-----------------------------
let say we have a sample xml data
<rec>
<name>Hamza</name>
<age>25</age>
</rec>
<rec>
<name>Ali</name>
<gender>M</gender>
</rec>
if the above file is loaded into the hive table then there will be 8 records or rows, beacuse hive process
the data line by line
we need to transfer each vertical xml record into horizantal xml reocrd
you can use MapReduce or Spark for this purpose
after the conversion the file will look like this
<rec><name>Hamza</name><age>25</age></rec>
<rec><name>Ali</name><gender>M</gender></rec>
<rec><name>Asim</name><age>22</age><gender>M</gender></rec>
this type of transformation is done before bringing the data into the hive table
once the data is transferred you can use hive built in xml parsers to load the xml data, youu dont need
to write any complicated code for it
we then do the following steps in hive
create database xmls;
use xmls;
create table raw(line string);
load data local inpath 'xml1' into table raw;
select xpath_string(line,'rec/name') from raw;
RESULT:
Hamza
Ali
Asim
//xapth is built xml template praser
select xpath_int(line,'rec/age') from raw;
RESULT:
25
0
create table info(name string,age int,gender string)
row format delimited
fields terminated by ',';
then we will load data into this table using
insert into table info
select xpath_string(line,'rec/name'),
xpath_int(line,'rec/age'),
xpath_int(line,'rec/sex'),
from raw;
//////////////////////////////////////////////
we can also work on some more complex data
<rec><name><fname>Zubair</fname><lname>Ashraf></lname></name><age>23</age><contact><email><personal>abc@gmail.com</personal><offical>abc@offical.com</offical></email><phone><mobile>123</mobile><office>332</office><residence>432</residence/contact><city>Lahore</city></rec>
create table xraw(line string);
create table xinfo(fname string,lname string,age int,personal_email string,offical_email string,mobile string,office_phone string,residence_phone string,city string);
another type of xml file you can insert is
<rec><name>Hamza</name><qual>BSCS</qual><qual>MSCS</qual></rec>
<rec><name>Ali</name><qual>BSCS</qual><qual>MSCS</qual><qual>PHD</qual></rec>
for this type of data use
select xpath(line,'rec/qual/text()') from xmlraw;
now as the qual is returning a array so in the table we have to make a array
create table raw2(name string,qual array<string>);
insert overwrite table raw2
> select xpath_string(line,'rec/name'),
> xpath(line,'rec/qual/text()') from xmlraw;
select name,size(qual) from raw2;
select * from raw2 where array_contains(qual,'BSCS');
if you want to know how many people have done BSCS?
how many times BSCS or MSCS?
you cant do it direcctly with the arrays
select explode(qual) as q from raw2;
select name,myq from raw2 lateral view explode(qual) q as myq;
how what if we want to calculate the total bill of the custtomer on the following data
cat > xml4
<tr><cid>101</cid><pr>1000</pr><pr>2000</pr><pr>4000</pr></tr>
<tr><cid>102</cid><pr>3000</pr><pr>2000</pr></tr>
<tr><cid>101</cid><pr>5000</pr>
create table sraw(line string);
load data local inpath 'xml4' into table sraw;
as the xpath returns the price in string form so in the schema there will be array of string
create table sraw2(cid int, pr array<string>);
insert into table sraw2
> select xpath_int(line,'tr/cid'),
> xpath(line,'tr/pr/text()') from sraw;
to flaten the things we use explode function
select explode(pr) as p from sraw2;
from this query we can only get the price, but we cant get the customer id, but as we know that along with the UDTF
functions we cannot use another column so for this purpose we use lateral view
as we will flat the things so in this table schema we will use pr as int
create table sales(cid int, pr int)
> row format delimited
> fields terminated by ',';
the column mypr is generated through the lateral view
insert into table sales
> select cid,mypr from sraw2
> lateral view explode(pr) p as mypr;
now atfer this we can get results
create table results(cid int,tot int);
now to get the total bill of each customer use the following query
insert into table results
> select cid,sum(pr) from sales group by cid;
let us say that your xml tag has parameters how will you handle that data
samle data:
cat > xml5
<tr><cid>101</cid><pr id="p1">4000</pr><pr id="p7>5000</pr></tr>
<tr><cid>102</cid><pr id="p7">5000</pr><pr id="p2">3500</pr></tr>
now we will see how to extract these parameters from the tags
create table xxraw(line string);
load data local inpath 'xml5' into table xxraw;
so to get the parameter under the tag use the following query
select xpath(line,'tr/pr/@id') from xxraw;
JSON DATA PROCESSING:
------------------------------------------------
{
"name:"hamza",
"age":25
}
{
"name":"ali",
"city":"Lahore"
}
just like xml if we isnert this data then there will be 8 records in hive, so we have to transfer this vertical record
into horizontal record
json1:
{"name:"hamza","age":25}
{"name":"ali","city":"Lahore"}
then we will do the following
create database jsons;
use jsons;
create table raw(line string);
load data local inpath 'json1' into table raw;
two functions-->to extract json fields
1.get_json_object()-->this is udf,get a particular field
2.json_tuple()------->this is udtf, this is also applied with lateral view like explode, get sequence of fields
two way to extract json fields
1.select get_json_object(line,'$.name') from raw;
to get all the columns you can use
select get_json_object(line,'$.name'),
-> get_json_object(line,'$.age'),
-> get_json_object(line,'$.city') from raw;
2. x.* this indcates multiple views
select x.* from raw
lateral view json_tuple(line,'name','age','city') x as n,a,c;
insert overwrite table info
> select x.* from raw
> lateral view json_tuple(line,'name','age','city') x as n,a,c;
so the difference is that in the first way you have separate function for every column but in the second way you
have only one function for every column
......now to handle nested json objects............
{
"name":"ali",
"age":25,
"wife":{
"name":"annie",
"age":22,
"city":"Lahore
}
"city":"Lahore"
}
convert
{"name":"ali","age":25,"wife":{"name":"annie","age":22,"city":"Lahore},city":"Lahore"}
{"name":"asim","age":23,"wife":{"name":"ayehsa","age":20,"city":"Lahore},city":"Lahore"}
load this data into table jraw
create table raw2(name string ,age int,wife string,city string);
insert into table raw2
> select x.* from jraw
> lateral view json_tuple(line,'name','age','wife','city') x as n,a,w,c:
after this data will still not be fully raw structured some data will be in json we will create another table
create table jinfo(hname string,wname string,hage int,wage int,hcity string,wcity string,wqual string)
> row format delimited
> fields terminated by ',';
insert into table jinfo
> select name, get_json_object(wife,'$.name),
> age, get_json_object(wife,'$.age),
> city, get_json_object(wife,'$.city),
> get_json_object(wife,'$.qual) from raw2;
then select * from jinfo
after working witht the nested json now we will work with json that will have collections
these are basically json arrays
make a file of sample data
{"name":"Hamza","qual":["BBIT","MSCS","MSDS"]}
{"name":"Ali","qual":["BBIT","MSCS"]}
{"name":"Watto","qual":["BBA","MBA","MPhill"]}
create table jsraw(line string);
and load the above file data into this table
in the jsraw2 table schema the qual is string because in json the data in left and right square bracket is consider
as a single string
create table jsraw2(name string,qual string);
then insert data into this table
insert into table jsraw2 select x.* from jsraw lateral view json_tuple(line,'name','qual') x as n,q;
data will be in this shape:
Hamza ["BBIT","MSCS","MSDS"]
Ali ["BBIT","MSCS"]
Watto ["BBA","MBA","MPhill"]
keep in mind that the second column is stored as a single string and not a array
but the comma separator tells us that their are three qualification
now to get the array we will split,but as the "" are part of our string then it will mask it with \
select split(qual,',') from jsraw2;
after spliting the result is actually a array and not a string
RESULT:
["[\"BBIT\"","\"MSCS\"","\"MSDS\"]"]
["[\"BBIT\"","\"MSCS\"]"]
["[\"BBA\"","\"MBA\"","\"MPhill\"]"]
we will store this array into another table thats why in the schema of raw3 the qualification is array
create table raw3(name string,qual array<string>);
then insert the data
insert into table raw3 select name,split(qual,',') from jsraw2;
after loading the data into this table it is still not in pure form and we want to flatten,for this purpose we will use
explode
the shadp of data after using explode will be like this
select explode(qual) as q from raw3;
RESULT:
["BBIT"
"MSCS"
"MSDS"]
["BBIT"
"MSCS"]
["BBA"
"MBA"
"MPhill"]
now we will create another table
create table raw4(name string,qual string);
and then insert the data as
insert overwrite table raw4 select name,myq from raw3 lateral view explode(qual) q as myq;
RESULT:
Hamza ["BBIT"
Hamza "MSCS"
Hamza "MSDS"]
Ali ["BBIT"
Ali "MSCS"]
Watto ["BBA"
Watto "MBA"
Watto "MPhill"]
after inserting you can still see that some unnecessary data is their
now if you split this data you will get pure qualification in array as second element because when we used explode
let say for hamza,if we use "" as delimitore then the first element we get is below
["[\"BBIT\"","\"MSCS\"","\"MSDS\"]"] vedio(14:00)
Hamza ["BBIT"------------------->"[" BBIT
Hamza "MSCS"-------------------->"" MSCS
Hamza "MSDS"] ------------------>"" MSDS "]"
now if we use "" as delimitor see the result
select split(qual,'""') from raw4;
you get array
["[\"BBIT\""]
["\"MSCS\""]
["\"MSDS\"]"]
["[\"BBIT\""]
["\"MSCS\"]"]
["[\"BBA\""]
["\"MBA\""]
["\"MPhill\"]"]
now from this array we can see that the second element is the qualification and in hive we can get it through index
select split(qual,'"')[1] from raw4;
BBIT
MSCS
MSDS
BBIT
MSCS
BBA
MBA
MPhill
create table jsinfo like raw4;
insert into table jsinfo select name,split(qual,'"')[1] from raw4;
select * from jsinfo;
OK
Hamza BBIT
Hamza MSCS
Hamza MSDS
Ali BBIT
Ali MSCS
Watto BBA
Watto MBA
Watto MPhill
now see the transformation of data
Stage1:
....................................................
select * from jsraw;
OK
{"name":"Hamza","qual":["BBIT","MSCS","MSDS"]}
{"name":"Ali","qual":["BBIT","MSCS"]}
{"name":"Watto","qual":["BBA","MBA","MPhill"]}
.....................................................
Stage2:
select * from jsraw2;
OK
Hamza ["BBIT","MSCS","MSDS"]
Ali ["BBIT","MSCS"]
Watto ["BBA","MBA","MPhill"]
.....................................................
Stage3:
select * from raw3;
OK
Hamza ["[\"BBIT\"","\"MSCS\"","\"MSDS\"]"]
Ali ["[\"BBIT\"","\"MSCS\"]"]
Watto ["[\"BBA\"","\"MBA\"","\"MPhill\"]"]
.....................................................
Stage4:
select * from raw4;
OK
Hamza ["BBIT"
Hamza "MSCS"
Hamza "MSDS"]
Ali ["BBIT"
Ali "MSCS"]
Watto ["BBA"
Watto "MBA"
Watto "MPhill"]
......................................................
Stage5:
select * from jsinfo;
OK
Hamza BBIT
Hamza MSCS
Hamza MSDS
Ali BBIT
Ali MSCS
Watto BBA
Watto MBA
Watto MPhill
......................................................
URL DATA PROCESSING:
-----------------------------------------------------------------------------------------------------------------------
when user visit from one page to another page moslty web logs are created and those web log contains urls
basically url are off two types
1.source urls
2.target urls
hive provides you with predefined url parsers to extract information from them
url has three parts
<host>/<path>?<query string>
file name:urls
http://training.com/bigdata/hadoop?id=101&name=Hamzai&age=23&city=hyd
http://training.com/bigdata/spark?id=102&name=Ali&gender=f&city=lhr
http://training.com/bigdata/spark?id=103&name=Asim&age=33&gender=f
http://training.com/bigdata/spark?id=101&name=Qumail&age=23&gender=m
from these we can exrtact data on which product page to advertise on the base of customer visited as we can see from the
the urls that spark is the most visited product
ek or swal uth skta ha k jese kis age k log product ko visit kr rhy hein us hisab se adds lgae ja skte hein
jis gender k log visit kr rhy hein male ya female uska add bhi lgaya ja skta ha
create database urls;
create table raw(line string);
load data local inpath 'urls' into table raw;
now we will first extract the hostname from the url
select parse_url(line,'HOST') from raw;
RESULT:
training.com
training.com
training.com
training.com
select parse_url(line,'PATH') from raw;
RESULT:
/bigdata/hadoop
/bigdata/spark
/bigdata/spark
/bigdata/spark
select parse_url(line,'QUERY') from raw;
RESULT:
OK
id=101&name=Hamzai&age=23&city=hyd
id=102&name=Ali&gender=f&city=lhr
id=103&name=Asim&age=33&gender=f
id=101&name=Qumail&age=23&gender=m
select parse_url(line,'HOST'),
> parse_url(line,'PATH'),
> parse_url(line,'QUERY') from raw;
OK
training.com /bigdata/hadoop id=101&name=Hamzai&age=23&city=hyd
training.com /bigdata/spark id=102&name=Ali&gender=f&city=lhr
training.com /bigdata/spark id=103&name=Asim&age=33&gender=f
training.com /bigdata/spark id=101&name=Qumail&age=23&gender=m
if you dont want to use parse_url again and again you can use the following
select x.* from raw
>lateral view parse_url_tuple(line,'HOST','PATH','QUERY') x as h,p,q;
then put all these things in a single table
create table raw2(host string,path string,query string);
then
insert into table raw2
>select x.* from raw
>lateral view parse_url_tuple(line,'HOST','PATH','QUERY') x as h,p,q;
RESULT:
training.com /bigdata/hadoop id=101&name=Hamzai&age=23&city=hyd
training.com /bigdata/spark id=102&name=Ali&gender=f&city=lhr
training.com /bigdata/spark id=103&name=Asim&age=33&gender=f
training.com /bigdata/spark id=101&name=Qumail&age=23&gender=m
now i want to get my product from the path /bigdata/spark and my clue to separate both is delimitor /, in this bigdata is
the category and hadoop,spark are the prodcuts and we can put them in a single array so the data type for it is array
id=101&name=Hamzai&age=23&city=hyd if we carefully observe this then we can see that it is delimited by & and it is a
key value pair and data type for key value pair in hive is map
then for the next stage of transformation we are going to create another table
create table raw3(host string,path array<string>,qmap map<string,string>);
there is a function in hive to convert string into map str_to_map(attribute,'delimitor','dekimitor for key value')
insert into table raw3
select host, split(path,'/'), str_to_map(query,'&','=') from raw2;
RESULT:
training.com ["","bigdata","hadoop"] {"id":"101","name":"Hamzai","age":"23","city":"hyd"}
training.com ["","bigdata","spark"] {"id":"102","name":"Ali","gender":"f","city":"lhr"}
training.com ["","bigdata","spark"] {"id":"103","name":"Asim","age":"33","gender":"f"}
training.com ["","bigdata","spark"] {"id":"101","name":"Qumail","age":"23","gender":"m"}
as we know that we access array with index and key value pair with key
now create another table with schema
create table info(host string,category string,course string,id int,name string,age int,gender string,city string);
as our course and category is in path attribute we will acces it like path[1] and path[2]
and if you pass the key you will get the value qmap['id']
insert overwrite table info
> select host,path[1],path[2],
> qmap['id'],qmap['name'],qmap['age'],qmap['gender'],qmap['city'] from raw3;
training.com bigdata hadoop 101 Hamzai 23 NULL hyd
training.com bigdata spark 102 Ali NULL f lhr
training.com bigdata spark 103 Asim 33 f NULL
training.com bigdata spark 101 Qumail 23 m NULL
now apply this query for analysis
select gender,count(*) as cnt from info
group by gender
order by cnt desc limit 1;
this query will tell which gender is watching most
RESULT: f 2