emp_analysis.py
import sys
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import count, avg

# SPARK_HOME may not be set in every environment, so use .get to avoid a KeyError
print(os.environ.get('SPARK_HOME'))
if __name__ == '__main__':
    print('inside python script for spark')
    if len(sys.argv) < 2:
        print('Provide data file path')
        sys.exit(-1)

    # build the SparkSession
    spark = (SparkSession
             .builder
             .appName('PythonEmpAnalysis')
             .getOrCreate())
    spark.sparkContext.setLogLevel('ERROR')

    # get the Jobs.csv dataset path from the command-line arguments, e.g.
    # C:\Users\krkusuk.REDMOND\Study\spark\Rural_Atlas_Update20\Jobs.csv
    jobs_file = sys.argv[1]
    '''
    The pyspark shell already creates a SparkSession, so everything
    above this point is done for you; the code below can be tested
    interactively in the shell.
    '''
    # download the data from
    # https://www.ers.usda.gov/data-products/atlas-of-rural-and-small-town-america/download-the-data/

    # read the CSV file into a DataFrame, inferring column types from the data
    job_df = (spark.read.format('csv')
              .option('header', 'true')
              .option('inferSchema', 'true')
              .load(jobs_file))
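    # Optional sanity check (not in the original script): with inferSchema
    # enabled, it is worth confirming that Spark picked sensible column types
    # before aggregating. Uncomment to inspect:
    # job_df.printSchema()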
    # count the number of counties in each state
    print('Counties per state')
    counties_count_df = (
        job_df.select('State', 'County')
              .groupBy('State')
              .agg(count('County').alias('Counties'))
              .orderBy('Counties', ascending=False)
    )
    counties_count_df.show(n=10, truncate=False)
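    # The same aggregation can also be expressed in Spark SQL; this is a
    # sketch, not part of the original script. Uncomment to try it:
    # job_df.createOrReplaceTempView('jobs')
    # spark.sql('''
    #     SELECT State, COUNT(County) AS Counties
    #     FROM jobs
    #     GROUP BY State
    #     ORDER BY Counties DESC
    # ''').show(n=10, truncate=False)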
    # calculate the average unemployment rate by state
    print('Unemployment rate per state')
    state_unemploymentrate_df = (
        job_df.select('State', 'UnempRate2018')
              .groupBy('State')
              .agg(avg('UnempRate2018').alias('AvgUnempRate2018'))
              .orderBy('AvgUnempRate2018', ascending=False)
    )
    state_unemploymentrate_df.show(n=10, truncate=False)
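    # For display, the averages can be rounded with
    # pyspark.sql.functions.round; a sketch, not part of the original script:
    # from pyspark.sql.functions import round as spark_round
    # state_unemploymentrate_df.select(
    #     'State',
    #     spark_round('AvgUnempRate2018', 2).alias('AvgUnempRate2018')
    # ).show(n=10, truncate=False)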
    # show the average unemployment rate for Washington state
    print('Unemployment in Washington state')
    state_unemploymentrate_df.filter(state_unemploymentrate_df.State == 'WA').show()
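    # Release the session's resources once the job is done
    spark.stop()

# A minimal usage sketch (assuming spark-submit is on your PATH and the
# Jobs.csv path matches your download location; adjust both as needed):
#
#   spark-submit emp_analysis.py /path/to/Rural_Atlas_Update20/Jobs.csv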