-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathIndeedDictionaryByLocation.py
121 lines (94 loc) · 3.9 KB
/
IndeedDictionaryByLocation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import json
from urllib.parse import quote_plus
from urllib.request import urlopen

import requests
from bs4 import BeautifulSoup
_INDEED_ROOT = 'https://www.indeed.com'
_SEARCH_URL = _INDEED_ROOT + '/jobs?q=computer+science&l='
_REQUEST_TIMEOUT = 10  # seconds; without it a stalled connection blocks forever
_MISSING = 'N/A'

# Skills looked for in each job description (matched as whole words/phrases).
_SKILLS = (
    'python', 'java', 'c++', 'sql', 'manage', 'javascript', 'linux', 'team',
    'problem solving', 'front end', 'back end', 'html', 'css', 'json', 'xml',
    'api', 'nodejs', 'c#', 'spark', 'sas', 'matlab', 'excel', 'hadoop',
    'azure', 'spss', 'git',
)

# Category labels in priority order: the first label with a matching keyword
# wins, so every job gets exactly one category.
_CATEGORY_KEYWORDS = (
    ('Artificial Intelligence',
     ('ai', 'a.i.', 'artificial intelligence', 'artificial')),
    ('Deep Learning',
     ('deep learning', 'neural networks', 'big data', 'deep', 'statistics')),
    ('Machine Learning',
     ('data mining', 'machine learning', 'cnn', 'rbm', 'machine',
      'natural language', 'regression', 'fault diagnosis',
      'intrusion detection')),
    ('Software Engineer',
     ('software engineer', 'software development', 'code')),
)


def _normalize(text):
    """Return *text* lower-cased as space-separated tokens with a space at each end.

    Every character that is not alphanumeric, ``+`` or ``#`` becomes a
    separator, so ``"C++, Front-End"`` turns into ``" c++ front end "``.
    Padding both ends lets a plain substring test act as a whole-word match.
    """
    kept = ''.join(
        ch if ch.isalnum() or ch in '+#' else ' ' for ch in text.lower()
    )
    return ' ' + ' '.join(kept.split()) + ' '


def _extract_skills(padded_text):
    """Return the skills from ``_SKILLS`` found in *padded_text*, in order of first appearance."""
    hits = []
    for skill in _SKILLS:
        position = padded_text.find(_normalize(skill))
        if position != -1:
            hits.append((position, skill))
    return [skill for _, skill in sorted(hits)]


def _categorize(padded_text):
    """Return the first category whose keywords occur in *padded_text*, else ``'Other'``."""
    for label, keywords in _CATEGORY_KEYWORDS:
        if any(_normalize(keyword) in padded_text for keyword in keywords):
            return label
    return 'Other'


def _fetch_soup(url):
    """Download *url* and return it parsed with the ``lxml`` parser."""
    response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'lxml')


def _first_text(card, names, css_class):
    """Return the whitespace-collapsed text of the first matching tag in *card*, or ``None``."""
    node = card.find(names, attrs={'class': css_class})
    if node is None:
        return None
    return ' '.join(node.get_text().split())


def _fetch_description(url):
    """Return the description text of the posting at *url*, or ``None`` if unavailable."""
    try:
        page = _fetch_soup(url)
    except requests.RequestException:
        # One unreachable posting must not discard every other result.
        return None
    node = page.find('div', class_='jobsearch-jobDescriptionText')
    return node.get_text() if node is not None else None


def getDict(loc):
    """Scrape Indeed "computer science" postings for a location.

    Each result card on the search page is read as a unit, so the fields of
    one record always come from the same posting. The posting's own page is
    then fetched to obtain its description, from which the skills and a
    single category are derived.

    NOTE(review): assumes each result card is a ``div`` with class ``row``
    that contains the title link, company, location, date and salary tags --
    this is taken from the selectors the previous implementation used;
    verify against the live page markup.

    Args:
        loc: Location to search for, e.g. ``'chicago'`` or ``'new york'``.

    Returns:
        A list with one dict per posting, with the keys ``'Company'``,
        ``'Location'``, ``'Title'``, ``'Date Created'``, ``'Salary'``,
        ``'Link'``, ``'Skills'`` (list of str), ``'Description'`` and
        ``'Category'``. Text fields that cannot be found are ``'N/A'``.

    Raises:
        requests.RequestException: If the search page cannot be downloaded
            (connection error, timeout, or HTTP error status).
    """
    # safe='+' keeps callers that already pass 'new+york' working.
    search_page = _fetch_soup(_SEARCH_URL + quote_plus(loc, safe='+'))

    jobs = []
    seen_links = set()
    for card in search_page.find_all('div', attrs={'class': 'row'}):
        title_box = card.find('div', class_='title')
        anchor = title_box.a if title_box is not None else None
        href = anchor.get('href') if anchor is not None else None
        if not href:
            continue  # not a job card
        job_url = _INDEED_ROOT + href
        if job_url in seen_links:
            continue  # nested "row" divs would otherwise repeat a posting
        seen_links.add(job_url)

        title = anchor.get('title') or anchor.get_text(strip=True) or _MISSING
        company = (
            _first_text(card, 'span', 'company')
            or _first_text(card, 'span', 'result-link-source')
            or _MISSING
        )
        location = _first_text(card, ['div', 'span'], 'location') or _MISSING
        posted = _first_text(card, 'span', 'date') or _MISSING
        salary = (
            _first_text(card, 'span', 'salaryText')
            or _first_text(card, 'span', 'sjcl')
            or _MISSING
        )

        description = _fetch_description(job_url)
        padded = _normalize(description or '')

        jobs.append({
            'Company': company,
            'Location': location,
            'Title': title,
            'Date Created': posted,
            'Salary': salary,
            'Link': job_url,
            'Skills': _extract_skills(padded),
            'Description': description if description is not None else _MISSING,
            'Category': _categorize(padded),
        })
    return jobs
# Module-level call: scrapes the Chicago listings whenever this file is run
# or imported. The returned list is neither stored nor printed here.
getDict('chicago')