# Weibo data scraper, keyword-based
# Author: David
# Github: https://github.com/HEUDavid/WeiboSpider
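# Layout of this script:
#   WeiboSpider  - parses one page of s.weibo.com search results into dicts
#   Search       - builds the search URLs for one keyword/region/time window
#   search()     - splits the overall date range into fixed-length windows
#   get_data()   - crawls a page range; main() fans the work out across four
#                  logged-in sessions/threads and appends the rows to a CSV
# CookieTest and SpiderThread are local helper modules from this repository:
# a cookie-backed session exposing get_page()/is_OK(), and a Thread subclass
# whose get_result() returns the target function's return value.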
import datetime
import random
import re
import time
import pandas as pd
from bs4 import BeautifulSoup
from CookieTest import CookieTest
from SpiderThread import SpiderThread

class WeiboSpider:
def __init__(self, html):
self.html = html

    def get_user_config(self, url):
        # Fetch the blogger's home page with a saved cookie and read the
        # numeric CONFIG['domain'] value out of the embedded JS config.
        path = './cookies/cookie_965019007@qq.com.json'
        session_obj = CookieTest(path)
        user_home = session_obj.get_page(url)
        s = user_home.index("CONFIG['domain']")  # full user info still TODO
        user_domain = int(user_home[s + 18:s + 24])
        return user_domain
        # return CONFIG
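    # Domain codes, per the author's mapping in get_results(): 100505 is a
    # personal account, 100106 a government account; anything else is
    # bucketed as '3' ("other").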

    def get_results(self):
        '''
        Extract every post on one page of search results.
        '''
        res = []
        soup = BeautifulSoup(self.html, 'html.parser')
        if len(soup.select('.card-no-result')) > 0:
            print('This page has no results')
            return res
        else:
            for i in soup.select('div[action-type="feed_list_item"]'):
                try:
                    blog = {}
                    blog['博主昵称'] = i.select('.name')[0].get('nick-name')
                    blog['博主主页'] = 'https:' + i.select('.name')[0].get('href')
                    # Classify the user type from the domain code.
                    user_domain = self.get_user_config(blog['博主主页'])
                    if user_domain == 100505:
                        blog['用户属性'] = '1'  # personal
                    elif user_domain == 100106:
                        blog['用户属性'] = '2'  # government
                    else:
                        blog['用户属性'] = '3'  # other
                    weibotext = ''
                    # Prefer the expanded full text; fall back to the
                    # truncated node when no "full" version exists.
                    if i.select('div[class="content"] p[node-type="feed_list_content_full"]'):
                        content = i.select(
                            'div[class="content"] p[node-type="feed_list_content_full"]')[0]
                        for item in content:
                            if item.name == 'img' and item.get('class') == ['face']:
                                weibotext += item.attrs['title']  # emoticon: keep its title text
                            elif item.string:
                                weibotext += item.string.strip()
                    if not weibotext:
                        if i.select('div[class="content"] p[node-type="feed_list_content"]'):
                            content = i.select(
                                'div[class="content"] p[node-type="feed_list_content"]')[0]
                            for item in content:
                                if item.name == 'img' and item.get('class') == ['face']:
                                    weibotext += item.attrs['title']  # emoticon: keep its title text
                                elif item.string:
                                    weibotext += item.string.strip()
                    if weibotext:
                        blog['微博内容'] = weibotext.replace('\r', '')
                    else:
                        # This post has no text content; skip it.
                        continue
                    blog['发布时间'] = self.get_datetime(
                        i.select('div[class="content"] p[class="from"] a')[0].get_text().strip())
                    blog['微博地址'] = 'https:' + \
                        i.select('div[class="content"] p[class="from"] a')[0].get('href')
                    try:
                        blog['微博来源'] = i.select(
                            'div[class="content"] p[class="from"] a')[1].get_text()
                    except IndexError:  # no source link on this post
                        blog['微博来源'] = ''
                    # Forward/comment/like counts sit in the action bar;
                    # an empty label means zero.
                    sd = i.select('.card-act ul li')
                    try:
                        text = sd[1].text.replace('转发', '').strip()
                        blog['转发'] = int(text) if text else 0
                    except Exception:
                        blog['转发'] = 0
                    try:
                        text = sd[2].text.replace('评论', '').strip()
                        blog['评论'] = int(text) if text else 0
                    except Exception:
                        blog['评论'] = 0
                    try:
                        text = sd[3].select('em')[0].get_text()
                        blog['赞'] = int(text) if text else 0
                    except Exception:
                        blog['赞'] = 0
                    res.append(blog)
                except Exception:
                    print('Skipped:', i.get_text().replace('\n', '').strip())
return res

    def get_datetime(self, s):
        '''
        Normalize Weibo's relative timestamps to 'YYYY-MM-DD HH:MM'.
        '''
        try:
            today = datetime.datetime.today()
            if '今天' in s:  # "today HH:MM"
                H, M = re.findall(r'\d+', s)
                date = datetime.datetime(
                    today.year, today.month, today.day,
                    int(H), int(M)).strftime('%Y-%m-%d %H:%M')
            elif '分' in s:  # "N分钟前" means N minutes ago
                M = re.findall(r'\d+', s)[0]
                date = (today - datetime.timedelta(
                    minutes=int(M))).strftime('%Y-%m-%d %H:%M')
            elif '秒' in s:  # seconds ago: round to the current minute
                date = today.strftime('%Y-%m-%d %H:%M')
            elif '年' in s:  # full date including the year
                y, m, d, H, M = re.findall(r'\d+', s)
                date = datetime.datetime(
                    int(y), int(m), int(d),
                    int(H), int(M)).strftime('%Y-%m-%d %H:%M')
            else:  # "MM月DD日 HH:MM", i.e. the current year
                m, d, H, M = re.findall(r'\d+', s)
                date = datetime.datetime(
                    today.year, int(m), int(d),
                    int(H), int(M)).strftime('%Y-%m-%d %H:%M')
        except Exception:
            print('Unexpected time format:', s)
            date = s
        return date
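    # Illustrative inputs (assumed s.weibo.com formats) and what they map to:
    #   '今天 12:30'            -> '<today> 12:30'
    #   '5分钟前'               -> now minus 5 minutes
    #   '10秒前'                -> now, rounded to the minute
    #   '2018年04月01日 08:00'  -> '2018-04-01 08:00'
    #   '04月01日 08:00'        -> '<current year>-04-01 08:00'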

class Search:
    '''
    Query parameters of one search request, e.g.:
        q: 刘强东
        region: custom:11:1000
        scope: ori
        suball: 1
        timescope: custom:2019-04-01:2019-04-02
        Refer: g
        page: 2
    '''
    def __init__(self, keyword, timescope, region, type='scope=ori', include='suball=1'):
        self.keyword = keyword
        self.timescope = timescope
        self.region = region    # location: province, city
        self.type = type        # post type: all / hot / original (scope=ori) / followed / verified / media / opinion
        self.include = include  # must contain: images / video / music / short links
        self.totalPage = 0
    def get_url(self, page):
        '''
        Assemble one search URL.
        '''
        url = (f'https://s.weibo.com/weibo?q={self.keyword}&region={self.region}'
               f'&{self.type}&{self.include}&timescope={self.timescope}&Refer=g&page={page}')
        return url
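    # With the defaults from search() below, page 2 of the first window is
    # (illustrative, shown before any percent-encoding of the keyword):
    # https://s.weibo.com/weibo?q=刘强东&region=custom:0:1000&scope=ori&suball=1&timescope=custom:2018-01-01-0:2018-07-20-0&Refer=g&page=2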
    def get_totalPage(self, html):
        '''
        Parse the first results page to find the total number of pages.
        '''
        soup = BeautifulSoup(html, 'html.parser')
        if len(soup.select('.card-no-result')) > 0:
            totalpage = 0
        elif soup.select('.s-scroll li'):
            totalpage = len(soup.select('.s-scroll li'))
        else:
            totalpage = 1  # no pager widget: only one page
        self.totalPage = totalpage
        return totalpage

def search():
    '''
    Build one Search instance per time window; return them as a list.
    '''
    search_list = []
    keyword = '刘强东'  # search keyword
    prov = '0'  # province code, see prov.png
    city = '1000'  # 1000 = any city
    region = f'custom:{prov}:{city}'
    startTime = '2018-01-01-0'
    endTime = '2019-01-01-0'
    start_date = datetime.datetime.strptime(startTime, '%Y-%m-%d-%H')
    end_date = datetime.datetime.strptime(endTime, '%Y-%m-%d-%H')
    period_length = 200  # window length in days; shorter windows give finer queries
    start_temp = start_date
    end_temp = start_temp + datetime.timedelta(days=period_length)
    while end_temp <= end_date:
        startTime = start_temp.strftime('%Y-%m-%d-%H')
        endTime = end_temp.strftime('%Y-%m-%d-%H')
        timescope = f'custom:{startTime}:{endTime}'
        search_list.append(Search(keyword, timescope, region))
        start_temp = end_temp
        end_temp = start_temp + datetime.timedelta(days=period_length)
    if start_temp < end_date:  # don't drop the final partial window
        timescope = (f'custom:{start_temp.strftime("%Y-%m-%d-%H")}'
                     f':{end_date.strftime("%Y-%m-%d-%H")}')
        search_list.append(Search(keyword, timescope, region))
    return search_list
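# With the defaults above (2018-01-01 to 2019-01-01, 200-day windows) this
# yields two windows: custom:2018-01-01-0:2018-07-20-0, then the remainder
# custom:2018-07-20-0:2019-01-01-0.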

def get_data(search_obj, session_obj, start, end):
    '''
    Crawl pages [start, end] of one search.
    '''
    print(f'Crawling pages {start} to {end}')
    data = []
    for i in range(start, end + 1):
        url = search_obj.get_url(i)
        try:
            html = session_obj.get_page(url)
            spider = WeiboSpider(html)
            results = spider.get_results()
            if results:
                data.extend(results)
            else:
                print(f'\033[0;0;31m{url}\033[0m')  # red: page yielded nothing
            time.sleep(random.randint(1, 3))
        except Exception:
            # Retry once after a pause; a second failure propagates.
            print(f'Error, retrying: {url}')
            time.sleep(5)
            html = session_obj.get_page(url)
            spider = WeiboSpider(html)
            results = spider.get_results()
            if results:
                data.extend(results)
            else:
                print(f'\033[0;37;41m{url}\033[0m')  # red background: retry also empty
            time.sleep(random.randint(1, 3))
        print(f'Page {i} done')
    print(f'\033[0;30;47mFinished pages {start} to {end}\033[0m')
    return data

def main():
    # One logged-in session per saved cookie file.
    session_obj1 = CookieTest('./cookies/cookie_18846426742.json')  # cookie path
    session_obj2 = CookieTest('./cookies/cookie_admin@mdavid.cn.json')
    session_obj3 = CookieTest('./cookies/cookie_854107424@qq.com.json')
    session_obj4 = CookieTest('./cookies/cookie_965019007@qq.com.json')
    html1 = session_obj1.get_page()
    html2 = session_obj2.get_page()
    html3 = session_obj3.get_page()
    html4 = session_obj4.get_page()
    if not session_obj1.is_OK(html1):
        print('cookie_18846426742.json has expired')
        return None
    if not session_obj2.is_OK(html2):
        print('cookie_admin@mdavid.cn.json has expired')
        return None
    if not session_obj3.is_OK(html3):
        print('cookie_854107424@qq.com.json has expired')
        return None
    if not session_obj4.is_OK(html4):
        print('cookie_965019007@qq.com.json has expired')
        return None
    searchList = search()  # same query, one instance per time window
    count = 0
    savePath = './data/' + searchList[0].keyword + '.csv'
    for search_obj in searchList:
        url = search_obj.get_url(1)
        html = session_obj1.get_page(url)
        totalPage = search_obj.get_totalPage(html)
        if totalPage == 0:
            print(f'\033[0;0;43m{search_obj.timescope[7:]} has no posts\033[0m')
            continue
        print(f'\033[0;0;33m{search_obj.timescope[7:]} has {totalPage} pages\033[0m')
        starttime = time.time()
        # Single-threaded alternative:
        # data = get_data(search_obj, session_obj1, 1, totalPage)
        # Multi-threaded path:
        if totalPage <= 12:  # few pages: one thread is enough
            data = get_data(search_obj, session_obj1, 1, totalPage)
        else:
            totalPage = 20  # capped at 20 pages for the demo
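            # Page split for totalPage = 20 (step = 5): thread 1 -> pages 1-6,
            # thread 2 -> 7-12, thread 3 -> 13-18, thread 4 -> 19-20.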
step = int(totalPage / 4)
t1 = SpiderThread(get_data, args=(
search_obj, session_obj1, 1, 1 + step))
t2 = SpiderThread(get_data, args=(
search_obj, session_obj2, 2 + step, 2 + step * 2))
t3 = SpiderThread(get_data, args=(
search_obj, session_obj3, 3 + step * 2, 3 + step * 3))
t4 = SpiderThread(get_data, args=(
search_obj, session_obj4, 4 + step * 3, totalPage))
t1.start()
t2.start()
t3.start()
t4.start()
t1.join()
t2.join()
t3.join()
t4.join()
data = t1.get_result() + t2.get_result() + t3.get_result() + t4.get_result()
        endtime = time.time()
        totaltime = endtime - starttime  # crawl time for this window
        print(f'\033[0;0;32mCollected {len(data)} posts,\033[0m',
              'took: {0:.5f} s'.format(totaltime))
        count += len(data)
        df = pd.DataFrame(data)
        # Append mode; no header row is written. Columns follow dict insertion
        # order: 博主昵称,博主主页,用户属性,微博内容,发布时间,微博地址,微博来源,转发,评论,赞
        df.to_csv(savePath, mode='a', header=None, index=None)
    print(f'\033[1;0;31mCollected {count} posts in total\033[0m')
    print(savePath, 'saved')

if __name__ == '__main__':
    time1 = time.time()
    main()
    time2 = time.time()
    all_time = time2 - time1
    print('Took: {0:.2f} s'.format(all_time))