-
Notifications
You must be signed in to change notification settings - Fork 30
/
zhihu.py
97 lines (83 loc) · 3.66 KB
/
zhihu.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2016-11-19 19:50:10
# Project: zhihu
from pyspider.libs.base_handler import *
import random
import MySQLdb
class Handler(BaseHandler):
    """pyspider handler that crawls a Zhihu topic's "top answers" pages.

    The crawl starts at a topic listing page, follows every question link to
    its detail page, follows the pager links to further listing pages, and
    stores each question and the answer bodies found on its page in the
    ``question`` and ``comment`` tables of a local MySQL database.
    """

    crawl_config = {
        'itag': 'v1',
        'headers': {
            'User-Agent': 'GoogleBot',
            'Host': 'www.zhihu.com',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        }
    }

    def __init__(self):
        # Connect to the database.
        # NOTE(review): host and credentials are hard-coded; consider moving
        # them to configuration.
        self.db = MySQLdb.connect('localhost', 'root', 'root', 'wenda', charset='utf8')

    def _insert(self, sql, params):
        """Execute one parameterized INSERT, commit it and return the new row id.

        Values are passed to the driver separately from the SQL text, so
        scraped content containing quotes cannot break or alter the statement.

        Returns ``cursor.lastrowid`` on success. On any error the exception is
        printed, the transaction is rolled back and ``None`` is returned
        (best-effort: one bad row must not abort the crawl).
        """
        cursor = None
        try:
            cursor = self.db.cursor()
            cursor.execute(sql, params)
            row_id = cursor.lastrowid
            self.db.commit()
            return row_id
        except Exception as e:
            print(e)
            self.db.rollback()
            return None
        finally:
            if cursor is not None:
                cursor.close()

    def add_question(self, title, content, comment_count):
        """Insert a question row and return its id.

        Args:
            title: question title text.
            content: question description HTML (may be an empty string).
            comment_count: number of comments collected for the question.

        Returns:
            The id of the inserted row, or ``None`` if the insert failed.
        """
        sql = ('insert into question(title, content, user_id, created_date, comment_count) '
               'values (%s, %s, %s, now(), %s)')
        # The author is a random user id in the range 1..10.
        return self._insert(sql, (title, content, random.randint(1, 10), comment_count))

    def add_comment(self, qid, comment):
        """Insert one comment attached to the question with id ``qid``.

        Args:
            qid: id of the question row the comment belongs to.
            comment: comment body HTML.

        Returns:
            Always 0; the value carries no success information.
        """
        sql = ('insert into comment(content, user_id, entity_id, entity_type, created_date) '
               'values (%s, %s, %s, %s, now())')
        # entity_id is the question id.
        # NOTE(review): entity_type 1 presumably means "question" - confirm
        # against the schema.
        self._insert(sql, (comment, random.randint(1, 10), qid, 1))
        return 0

    @every(minutes=24 * 60)
    def on_start(self):
        """Seed the crawl with the first "top answers" page of the topic."""
        # Top answers of the "Internet" topic.
        # Alternative seed ("Programmer" topic):
        #   https://www.zhihu.com/topic/19552330/top-answers?page=1
        self.crawl('https://www.zhihu.com/topic/19550517/top-answers?page=1',
                   callback=self.index_page, validate_cert=False)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Queue every question on a listing page, then the other listing pages."""
        # Each question link leads to that question's detail page.
        for each in response.doc('a.question_link').items():
            self.crawl(each.attr.href, callback=self.detail_page, validate_cert=False)
        # Pager links are fed back into index_page, which implements paging
        # through all top questions of the topic.
        for each in response.doc('.zm-invite-pager span a').items():
            self.crawl(each.attr.href, callback=self.index_page, validate_cert=False)

    @config(priority=2)
    def detail_page(self, response):
        """Store a question and the comments found on its page.

        Returns:
            A dict with the page ``url``, the question ``title`` and its
            ``content`` (description HTML, empty string when absent).
        """
        title = response.doc('span.zm-editable-content').text()  # question title
        # Question description; some questions have none, so html() may be None.
        html = response.doc('#zh-question-detail .zm-editable-content').html()
        content = html if html is not None else ''
        print(content)

        # Elements carrying both classes hold the comment bodies. Further
        # comment pages are loaded via Ajax and are not handled yet.
        comments = [
            each.html() or ''
            for each in response.doc('div.zm-editable-content.clearfix').items()
        ]

        qid = self.add_question(title, content, len(comments))
        # Without a question row there is nothing to attach comments to.
        if qid is not None:
            for comment in comments:
                self.add_comment(qid, comment)

        return {
            "url": response.url,
            "title": title,
            "content": content,
        }