-
Notifications
You must be signed in to change notification settings - Fork 0
/
helpers.py
291 lines (225 loc) · 9.21 KB
/
helpers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
from collections import Counter
import sys
import csv
from google import google
from concurrent.futures import ThreadPoolExecutor
import csv
import json
from multiprocessing import Pool
from sortedcontainers import SortedSet
import difflib
from urllib.request import urlopen
from urllib.parse import quote_plus
import json
import re
from threading import Thread
import matplotlib.pyplot as plt
import numpy as np
import os
import time
from google import google
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
import requests
from tornado_fetcher import Fetcher
from bs4 import BeautifulSoup
import re
# Anchored match of one or more word characters; used by invalid() below.
pattern = re.compile(r"[\w]+")  # raw string avoids the invalid-escape-sequence warning
# Column labels shared by the DataFrame-building helpers in this module.
columns = ['google_categories', 'partner_categories']
# Seed locations appended to Google Local queries ('' = no location bias).
cities = ['', 'Sydney, Australia', 'Toronto, Canada', 'London, England', 'Bengaluru, India', 'Auckland, New Zealand', 'Florida, United States']
# NOTE(review): initialize_fetcher() is defined further down this module, so this
# assignment raises NameError at import time — move it below the def, or make
# the fetcher lazily initialized. Left in place to preserve the module layout.
fetcher = initialize_fetcher()
def get_data(filepath):
    """Read *filepath* (UTF-8) and return its lowercased lines as a SortedSet."""
    with open(filepath, "r", encoding="utf-8") as handle:
        lowered_lines = handle.read().lower().splitlines()
    return SortedSet(lowered_lines)
def write_result(google_categories, partner_categories):
    """Append (google, partner) category pairs as rows to csv_filename.

    NOTE(review): csv_filename is not defined anywhere in this module's visible
    code — it must be assigned elsewhere before calling; verify against callers.
    """
    # newline='' is required by the csv module so rows aren't double-spaced on Windows.
    with open(csv_filename, 'a+', newline='') as csv_file:
        writer = csv.writer(csv_file)
        for g_cat, p_cat in zip(google_categories, partner_categories):
            writer.writerow([g_cat, p_cat])
def clear_csv():
    """Truncate csv_filename to zero length.

    The original first called ``csv_file.close()``, but ``csv_file`` is never a
    module-level name (write_result opens it inside a ``with`` block), so that
    line raised NameError. Truncation is the only effective behavior, kept here.
    """
    with open(csv_filename, 'w'):
        pass  # opening in 'w' mode truncates; nothing to write
def get_close_matches(difference, partner_categories_1, cutoff=0.96):
    """Fuzzy-match each category in *difference* against the partner list.

    Each category with at least one difflib match at or above *cutoff* becomes
    a row; the result uses the module-level *columns* labels.
    """
    rows = []
    for cat in difference:
        close = difflib.get_close_matches(str(cat), partner_categories_1, n=3, cutoff=cutoff)
        if not close:
            continue
        rows.append((cat, close))
    return pd.DataFrame(rows, columns=columns)
def get_synonyms(category):
    """Return up to five Datamuse 'means-like' synonyms for *category*.

    Non-ASCII characters are stripped before the term is URL-encoded.
    """
    term = quote_plus(re.sub(r'[^\x00-\x7f]', '', category))
    url = 'http://api.datamuse.com/words?ml={}'.format(term)
    # Context manager closes the HTTP response (the original leaked the handle).
    with urlopen(url) as response:
        data = json.loads(response.read().decode('utf-8'))
    return [entry['word'] for entry in data][:5]
def initialize_fetcher():
    """Build a tornado_fetcher.Fetcher backed by the local PhantomJS proxy.

    ``async`` became a reserved keyword in Python 3.7, so the original
    ``async=False`` keyword argument is a SyntaxError there; passing it through
    a **kwargs dict performs the same call while staying parseable.
    """
    options = {
        'user_agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
        'phantomjs_proxy': 'http://localhost:1234',  # phantomjs_fetcher.js endpoint
        'pool_size': 10,  # max httpclient num
        'async': False,
    }
    return Fetcher(**options)
def restart_phantomjs():
    """Kill any running PhantomJS and relaunch phantomjs_fetcher.js on port 1234.

    Callers invoke this periodically between batches of fetches. The leading
    sleep gives in-flight work time to finish, and the trailing sleep gives the
    new process time to start listening. NOTE(review): requires passwordless
    sudo for pkill and phantomjs on PATH — confirm on the deployment host.
    """
    time.sleep(5)
    os.system("sudo pkill -9 phantomjs")
    os.system("nohup phantomjs phantomjs_fetcher.js 1234 &")
    time.sleep(3)
def get_random_cities(count):
    """Scrape randomlists.com for *count* random "<city>, <country>" strings.

    Uses the module-level ``fetcher`` (PhantomJS-backed) because the page
    renders its list client-side. NOTE(review): the module initializes
    ``fetcher`` before initialize_fetcher is defined — confirm the fetcher is
    actually set up before calling this.
    """
    restart_phantomjs()
    url = "https://www.randomlists.com/random-world-cities?qty={}#".format(count)
    response = fetcher.phantomjs_fetch(url)
    soup = BeautifulSoup(response['content'], 'html.parser')
    # Local names shadow the module-level `cities` constant; these are the
    # scraped DOM nodes, paired positionally below.
    cities = soup.select(".rand_medium")
    countries = soup.select(".rand_small")
    return ["{}, {}".format(city.get_text(), country.get_text()) for city, country in zip(cities, countries)]
def fetch_businesses(category):
    """Collect Google Local business names for *category* across the seed cities.

    Queries google.com's local-results tab once per entry in the module-level
    ``cities`` list and gathers every heading element's text.
    """
    local_fetcher = initialize_fetcher()
    ascii_category = quote_plus(re.sub(r'[^\x00-\x7f]', '', category))
    names = []
    for city in cities:
        url = 'https://www.google.com/search?tbm=lcl&q={} {}'.format(ascii_category, city)
        page = local_fetcher.phantomjs_fetch(url)
        soup = BeautifulSoup(page['content'], 'html.parser')
        names.extend(heading.get_text() for heading in soup.select("div[role=heading]"))
    return names
def fetch_businesses_from_list(matches):
    """Map each category in *matches* to its scraped Google Local business list.

    PhantomJS is restarted every third category to keep the headless browser
    healthy; progress is printed as "<index> <category>".
    """
    mapping = {}
    for index, category in enumerate(matches):
        if index % 3 == 0:
            restart_phantomjs()
        mapping[category] = fetch_businesses(category)
        print(index, category)
    return mapping
def find_word_in_list(word, categories):
    """Return every category whose text contains *word* as a substring."""
    return [category for category in categories if word in category]
def save_businesses_to_file(categories, filename):
    """Scrape businesses for *categories* and dump the mapping to *filename* as JSON."""
    business_map = fetch_businesses_from_list(categories)
    with open(filename, "w") as out:
        json.dump(business_map, out, indent=4)
    print("################## STORED ##############")
def load_categories(filename):
    """Parse *filename* as JSON and return the decoded object.

    Streams through json.load instead of reading the whole file into a string
    first, and pins UTF-8 so the result doesn't depend on the locale encoding.
    """
    with open(filename, encoding="utf-8") as f:
        return json.load(f)
def fetch_business_categories(category):
    """Scrape Google Local result pages for the category shown beside each business.

    Queries google.co.in once per seed city and pulls the trailing "· category"
    fragment from each result's details span. Returns (category, scraped_list)
    so callers keep the query term paired with its results.
    """
    fetcher = initialize_fetcher()
    category_list = []
    formatted_category = quote_plus(re.sub(r'[^\x00-\x7f]', '', category))
    for city in cities:
        url = 'https://www.google.co.in/search?tbm=lcl&q={} {}'.format(formatted_category, city)
        response = fetcher.phantomjs_fetch(url)
        # Explicit parser matches the rest of the module and avoids bs4's
        # "no parser specified" warning / platform-dependent parser choice.
        soup = BeautifulSoup(response['content'], 'html.parser')
        try:
            categories = [element.select("div:nth-of-type(1)")[0].get_text().split('·')[-1].strip().lower() for element in soup.find_all("span", class_="rllt__details")]
        except (IndexError, AttributeError):
            # Narrowed from a bare except: only the lookup failures expected
            # when Google omits or changes the result markup.
            categories = []
        category_list += categories
    return (category, category_list)
def get_parent_category(category_map):
    """Pick a parent category from (query_category, scraped_category_list).

    Returns '' for an empty list and the sole value for a single distinct one.
    Otherwise the second-most-frequent value wins when it occurs more than
    twice — unless it equals the query itself, in which case the most frequent
    value is returned instead.
    """
    query, scraped = category_map
    if not scraped:
        return ''
    counts = Counter(scraped)
    most_frequent = max(counts, key=counts.get).lower()
    if len(counts) <= 1:
        return most_frequent
    by_frequency = sorted(counts, key=counts.get)  # keys, ascending by count
    runner_up_count = sorted(counts.values())[-2]
    if runner_up_count <= 2:
        # The runner-up is too rare to trust; fall back to the modal value.
        return most_frequent
    runner_up = by_frequency[-2].lower()
    return most_frequent if query == runner_up else runner_up
def fetch_categories_from_list(difference, file):
    """Scrape business categories for every entry in *difference*, saving to *file*.

    Restarts PhantomJS every third query to keep the headless browser healthy
    and prints each category as it completes.
    """
    parent_categories = []
    for i, category in enumerate(difference):
        if i % 3 == 0:
            restart_phantomjs()
        parent_categories.append(fetch_business_categories(category))
        print("{}".format(category))
    # Distinct handle name: the original rebound the 'file' parameter to the
    # open file object inside the with-statement, shadowing the path argument.
    with open(file, 'w') as out:
        out.write(json.dumps(parent_categories))
import csv
def load_results(filename):
    """Load a comma-delimited UTF-8 CSV file and return its rows as a list of lists."""
    with open(filename, "r", encoding='utf-8') as handle:
        return list(csv.reader(handle, delimiter=","))
def get_facebook_category(biz_name):
    """Google for a business's Facebook page and scrape its category labels.

    Returns the category strings from the mobile (touch) version of the first
    search result whose link points at facebook.com, or [] when no result or
    no labels are found.
    """
    results = google.search("{} facebook".format(biz_name))
    if not results:
        return []
    for result in results:
        if result.link and 'facebook.com' in result.link:
            # Target the hostname specifically: the original replaced the first
            # 'www' anywhere in the URL, which could corrupt path segments.
            link = result.link.replace('www.facebook.com', 'touch.facebook.com')
            response = requests.get(link, timeout=30)  # don't hang forever on a dead page
            soup = BeautifulSoup(response.text, 'html.parser')
            try:
                return [span.get_text() for span in soup.select('._59k._2rgt._1j-f._2rgt ._1j-g span')]
            except (AttributeError, IndexError):
                # Narrowed from a bare except: markup drift on Facebook's side.
                return []
    return []
def get_matching_categories(google_mapping, partner_mapping):
    """Pair google categories with partner categories sharing >= 4 businesses.

    Both arguments map category -> list of business names. A partner category
    matches when its business list overlaps the google one in at least four
    entries; the result is a DataFrame labelled by the module-level *columns*.
    """
    min_overlap = 4
    rows = []
    for g_category, g_businesses in google_mapping.items():
        g_set = set(g_businesses)
        partners = [
            p_category
            for p_category, p_businesses in partner_mapping.items()
            if len(g_set & set(p_businesses)) >= min_overlap
        ]
        if partners:
            rows.append((g_category, partners))
    return pd.DataFrame(rows, columns=columns)
def get_synonym_matches(difference, partner_categories_1):
    """Match single-word categories to partner categories via Datamuse synonyms.

    Multi-word categories are skipped. Every synonym of a category is fuzzy
    matched (cutoff 0.96) against the partner list; categories with at least
    one hit become a row in the returned DataFrame.
    """
    rows = []
    for category in difference:
        if len(category.split()) != 1:
            continue
        hits = []
        for synonym in get_synonyms(category):
            hits.extend(difflib.get_close_matches(synonym, partner_categories_1, n=3, cutoff=0.96))
        if hits:
            rows.append((category, hits))
    return pd.DataFrame(rows, columns=columns)
def get_facebook_df(facebook_mapping):
    """Flatten a list of {google_category: facebook_labels} dicts into a DataFrame.

    For each category the most frequent (lowercased) facebook label wins;
    entries whose label list is empty are dropped.
    """
    rows = []
    for category_dict in facebook_mapping:
        for google_cat, labels in category_dict.items():
            if not labels:
                continue
            label_counts = Counter(label.lower() for label in labels)
            top_label = max(label_counts, key=label_counts.get)
            rows.append((google_cat.lower(), top_label.lower()))
    return pd.DataFrame(rows, columns = ['google_categories', 'facebook_categories'])
def invalid(value):
    """True when *value* does not begin with a word character (module `pattern`)."""
    return pattern.match(value) is None
def get_most_occuring_categories(category_map):
    """List the scraped categories seen more than twice, most frequent first.

    *category_map* is (query_category, scraped_list). An empty list yields ''
    (note the inconsistent return type, preserved for caller compatibility);
    otherwise a list of category names is returned.
    """
    _, scraped = category_map
    if not scraped:
        return ''
    counts = Counter(scraped)
    frequent = [(name, total) for name, total in counts.items() if total > 2]
    frequent.sort(key=lambda pair: pair[1], reverse=True)
    return [name for name, _ in frequent]