-
Notifications
You must be signed in to change notification settings - Fork 1
/
LinkedInProfileUtil.py
94 lines (71 loc) · 2.81 KB
/
LinkedInProfileUtil.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# import pprint
import requests as req
# from bs4 import BeautifulSoup as bs
from config.search_credentials import GOOGLE_CUSTOM_SEARCH_API_KEY, CUSTOM_SEARCH_ENGINE_ID
import json
# REQUEST_URL= "https://www.googleapis.com/customsearch/v1"
# params = {
# "key": GOOGLE_CUSTOM_SEARCH_API_KEY,
# "cx": CUSTOM_SEARCH_ENGINE_ID,
# "q": "l-birnbaum@northwestern.edu"
# }
# pp = pprint.PrettyPrinter(indent=4)
# r = req.get(REQUEST_URL, params=params)
# results = r.json()
# # print json.dumps(results, indent=2)
# profiles = [{
# "profile_url":item["formattedUrl"],
# # "info":item["pagemap"]["person"][0],
# "hcard": item["pagemap"]["hcard"][0]
# }
# for item in results["items"] if "www.linkedin.com/in" in item["formattedUrl"]]
# profiles = [ item for item in results["items"] if "www.linkedin.com/in" in item["formattedUrl"]]
# print json.dumps(profiles, indent=2)
# this works ^
def get_linkedin_profiles_by_query(query):
    """Search Google Custom Search for LinkedIn profiles matching *query*.

    Sends a GET request to the Custom Search JSON endpoint using the
    configured API key and search-engine id, then keeps only the result
    items whose ``formattedUrl`` contains ``www.linkedin.com/in``.

    Args:
        query: Search string passed as the ``q`` request parameter.

    Returns:
        A list of dicts, each with ``"profile_url"`` (the item's
        ``formattedUrl``) and ``"hcard"`` (the first entry of the item's
        ``pagemap["hcard"]``). The list is empty when no item qualifies.
        ``None`` is returned when the response carries no ``"items"`` list
        (for example an error payload or a search with zero hits).

    Errors raised by the HTTP request itself or by JSON decoding are not
    caught here and propagate to the caller.
    """
    request_url = "https://www.googleapis.com/customsearch/v1"
    params = {
        "key": GOOGLE_CUSTOM_SEARCH_API_KEY,
        "cx": CUSTOM_SEARCH_ENGINE_ID,
        "q": query,
    }
    response = req.get(request_url, params=params)
    results = response.json()

    items = results.get("items") if isinstance(results, dict) else None
    if not isinstance(items, list):
        # No usable result list: keep the historical "None means failure"
        # contract so callers' truthiness checks continue to work.
        return None

    profiles = []
    for item in items:
        # Handle each item on its own so that one malformed hit (missing
        # "pagemap"/"hcard", or an empty hcard list) does not throw away the
        # other, well-formed profiles.
        try:
            profile_url = item["formattedUrl"]
            if "www.linkedin.com/in" not in profile_url:
                continue
            hcard = item["pagemap"]["hcard"][0]
        except (KeyError, IndexError, TypeError):
            continue
        profiles.append({"profile_url": profile_url, "hcard": hcard})
    return profiles
def get_job_title_from_linked_in(name):
    """Look up a job title for *name* via the LinkedIn profile search.

    Calls ``get_linkedin_profiles_by_query(name)`` and inspects only the
    first profile it returns.

    Args:
        name: Query string forwarded unchanged to the profile search.

    Returns:
        The ``'title'`` value of the first profile's hcard when present;
        otherwise the string ``"No Job Title Found"`` (also returned when
        the search yields nothing).
    """
    matches = get_linkedin_profiles_by_query(name)
    if not matches:
        # Covers both a None result and an empty list.
        return "No Job Title Found"

    top_hcard = matches[0]['hcard']
    return top_hcard['title'] if 'title' in top_hcard else "No Job Title Found"
# This doesn't work: LinkedIn has gotten a lot smarter about preventing scraping.
# def get_linkedin_profile_details(profile_url):
# headers = {
# 'User-Agent': "Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.2.3) Gecko/20100401 Firefox/3.6.3",
# # 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
# # 'accept-encoding': 'gzip, deflate, sdch, br'
# }
# res = req.get(profile_url, headers=headers)
# data = res.text
# print data
# # print res.headers
# # print dir(res)
# # print res.status_code
# soup = bs(data, features="html.parser")
# print soup.find_all("div", id="summary")
# print profiles[0]["profile_url"]
# get_linkedin_profile_details(profiles[0]["profile_url"])
# import json, sys
# print json.dumps(get_linkedin_profiles_by_query(sys.argv[1]), indent=2)