-
Notifications
You must be signed in to change notification settings - Fork 0
/
parent.py
118 lines (91 loc) · 4.49 KB
/
parent.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from urllib.parse import unquote
import time
# Gathering data from Google search results using Selenium
# The goal is to get the names, roles, URLs of the LinkedIn profiles of Software Developers
# The search query is: "Software Developers" -intitle:"profiles" -inurl:"dir /" site:linkedin.com/in/ OR site:linkedin.com/pub/
# ------------------------------------------------------------------------------------------------------------------
# Google search URL for the query described above. Spaces in the query are
# written as "+" so the literal is a valid URL; "%22" is an encoded double quote.
url = "https://www.google.com/search?q=+%22Software+Developers%22+-intitle:%22profiles%22+-inurl:%22dir/+%22+site:linkedin.com/in/+OR+site:linkedin.com/pub/"
# How many times the results page is scrolled to the bottom to load more hits.
number_of_swipes = 50
# ------------------------------------------------------------------------------------------------
def extract_name_and_role(html):
    """Parse a Google results page into parallel URL, name and role lists.

    A result is recognised as a ``<span jscontroller="msmzHf">`` containing an
    ``<a jsname="UWckNb">`` whose ``ping`` attribute carries the target address
    as a ``url=`` parameter.  The result's ``<h3>`` title is split on ``" - "``
    into a name and a role.  Only targets containing ``"linkedin"`` are kept.

    Args:
        html: Page source of the search results.

    Returns:
        A ``(urls, names, roles)`` tuple of lists with equal length; index
        ``i`` of each list describes the same search result.
    """
    soup = BeautifulSoup(html, 'html.parser')
    urls = []
    names = []
    roles = []
    for span in soup.find_all('span', {'jscontroller': 'msmzHf'}):
        anchor = span.find('a', {'jsname': 'UWckNb'})
        ping = anchor.get('ping') if anchor else None
        if not ping:
            continue

        # Work out the title once per result, with placeholders for anything
        # missing, so every URL below gets exactly one name and one role and
        # the three lists can never drift out of step.
        name = 'Name not found'
        role = 'Role not found'
        heading = span.find('h3')
        if heading:
            # Drop truncation ellipses and commas from the title
            # (presumably so it is safe inside a comma-separated row).
            title = heading.text.replace('...', '').replace(',', '')
            pieces = title.split(' - ')
            name = pieces[0]
            if len(pieces) > 1:
                role = pieces[1]

        # The ping value is an '&'-separated parameter list; the real target
        # is the percent-encoded value of its 'url=' parameter.
        for param in ping.split('&'):
            if param.startswith('url=') and "linkedin" in param:
                profile_url = unquote(param[4:])
                print(profile_url)
                urls.append(profile_url)
                names.append(name)
                roles.append(role)
    return urls, names, roles
# ------------------------------------------------------------------------------------------------------------------
# Gathering data from Google search results using Selenium
def gather_data_from_google_search():
    """Load the configured Google search in Chrome and return the page HTML.

    Opens the module-level ``url``, pauses so a possible anti-bot check can be
    dealt with, then scrolls to the bottom of the page ``number_of_swipes``
    times, clicking the "More results" button after each scroll when present.

    Returns:
        The page source as it stands after the last scroll.
    """
    # Function-scope import: selenium is already a dependency of this file.
    from selenium.common.exceptions import WebDriverException

    options = Options()
    # NOTE(review): recent Selenium 4 releases reportedly dropped the
    # ``headless`` setter in favour of options.add_argument("--headless=new");
    # a headless browser also cannot be verified by hand. Confirm the intent.
    options.headless = True
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        # There might be a small verification required to prove that you are
        # not a robot. It may have to be passed manually, hence the pause.
        time.sleep(15)
        for _ in range(number_of_swipes):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)
            # Wait for the "More results" button to be present, then click it.
            # Best effort: a missing or unclickable button is not an error.
            try:
                more_results = WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CLASS_NAME, "RVQdVd"))
                )
                more_results.click()
            except WebDriverException:
                print("More results button not found")
        html = driver.page_source
    finally:
        # Always shut the browser down, even if navigation or scrolling failed.
        driver.quit()
    return html
# ------------------------------------------------------------------------------------------------------------------
# ------------------------------------------------------------------------------------------------------------------
# Processing the HTML to extract the data
# The extraction relies on patterns observed in the HTML; each pattern is described in the function that uses it.
def main():
    """Scrape the search results and write the CSV files under ``./data``.

    ``data-mini.csv`` receives one ``name,url,role`` row per extracted profile
    and has no header row.  ``data.csv`` is overwritten with its header line
    only; nothing else is written to it here.
    """
    # Function-scope imports keep this entry point self-contained.
    import csv
    import os

    html = gather_data_from_google_search()
    urls, names, roles = extract_name_and_role(html)

    # open(..., 'w') does not create missing directories.
    os.makedirs('./data', exist_ok=True)

    # csv.writer quotes any field containing a comma or quote, so such values
    # cannot shift columns; '\n' keeps the original line ending.
    with open('./data/data-mini.csv', 'w', encoding='utf-8', newline='') as file:
        writer = csv.writer(file, lineterminator='\n')
        writer.writerows(zip(names, urls, roles))

    with open('./data/data.csv', 'w', encoding='utf-8') as file:
        file.write('Name,URL,Role,Current Workplace,Past Experience,Education,About\n')
    print('Data extraction complete. Check data.csv for the results.')
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()