crawl.py
# -*- coding: utf-8 -*-
"""Crawl second-hand house (ershoufang) listings from lianjia.com and save them to JSON files."""
import os
import time
import requests
from bs4 import BeautifulSoup
import json
import subprocess


class HouseCrawler(object):
    def __init__(self):
        requests.adapters.DEFAULT_RETRIES = 5
        sess = requests.session()
        sess.keep_alive = False
        self.use_proxy = False
        self.proxy_proc = None
        self.url_template = "https://{}.lianjia.com/ershoufang/{}/pg{}/"  # (id, region, page)
        self.house_set = set()
        self.load_config()

    def load_config(self):
        cfg_file = os.path.join(os.path.split(os.path.realpath(__file__))[0], "config.json")
        with open(cfg_file, mode='r', encoding='utf-8') as file_obj:
            content = file_obj.read()
        self.cfg = json.loads(content)
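
    # A minimal sketch of the config.json this crawler expects, inferred from the
    # keys read elsewhere in this file; the concrete values below are illustrative
    # assumptions only (the city "id" and region slugs must be valid lianjia.com
    # subdomain and district names):
    #
    # {
    #     "cookie": "<your lianjia.com cookie string>",
    #     "page_interval": 3,
    #     "pages_of_group": 10,
    #     "group_interval": 60,
    #     "cities": [
    #         {"name": "shanghai", "id": "sh", "regions": ["pudong", "minhang"]}
    #     ]
    # }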

    def crawl(self):
        today = time.strftime("%Y-%m-%d", time.localtime())
        for city in self.cfg["cities"]:
            city_house_array = []
            city_name = city["name"]
            city_id = city["id"]
            if city_name == "" or city_id == "":
                continue
            self.house_set = set()
            for region in city["regions"]:
                city_house_array.extend(self.crawl_region(city_id, region))
            house_info = {}
            house_info["date"] = today
            house_info["houses"] = city_house_array
            print("Writing to file...")
            house_file = os.path.join(os.path.split(os.path.realpath(__file__))[0],
                                      "houses", city_name, today + ".json")
            self.write_file(house_file, house_info)
            print("Sleeping for a long time...")
            time.sleep(60)
        # If a Clash proxy process was started, shut it down.
        if self.proxy_proc:
            self.proxy_proc.terminate()
            self.proxy_proc.wait()

    def _parse_basic_info(self, text):
        basic_list = list(filter(None, text.split("|")))
        layout = basic_list[0].strip()
        area = basic_list[1].strip().replace("平米", "")
        return [layout, area]
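
    # Example of the "|"-separated houseInfo text this parses (the sample string is
    # an assumption based on typical lianjia.com listing cards):
    #   _parse_basic_info("2室1厅 | 89.5平米 | 南 北 | 精装")  ->  ["2室1厅", "89.5"]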

    def crawl_region(self, city_id, region):
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/49.0', 'Cookie': self.cfg["cookie"]}
        # Clash's local proxy port speaks plain HTTP, so both schemes point at an http:// proxy URL.
        proxies = {
            'http': 'http://127.0.0.1:7890',
            'https': 'http://127.0.0.1:7890'
        }
        house_array = []
        page = 1
        max_page = 100
        max_page_found = False
        slow_count = 0
        while True:
            if page > max_page:
                break
            page_url = self.url_template.format(city_id, region, page)
            print(page_url)
            try:
                time1 = time.time()
                response = requests.get(page_url, headers=headers, proxies=proxies if self.use_proxy else None, timeout=8)
                time2 = time.time()
                # Check the request duration: if it is too long (usually around 10 seconds),
                # the site is throttling the crawler, so start the Clash proxy.
                if (not self.use_proxy) and (time2 - time1) > 3:
                    slow_count += 1
                    if slow_count > 2:
                        print("Too slow, starting the Clash proxy...")
                        self.proxy_proc = subprocess.Popen([r"D:\Program Files\Clash for Windows\Clash for Windows.exe"])
                        self.use_proxy = True
                        time.sleep(15)
                page_content = response.content.decode("utf-8", "ignore")
                #page_file = os.path.split(os.path.realpath(__file__))[0] + "\\{}.html".format(page)
                #with open(page_file, mode='w', encoding='utf_8_sig') as file_obj:
                #    file_obj.write(page_content)
            except Exception as e:
                print("exception:" + str(e))
                page += 1
                time.sleep(5)
                continue
            page_soup = BeautifulSoup(page_content, "html.parser")
            try:
                # Find the max page number once, from the pager's embedded JSON.
                if not max_page_found:
                    page_data = page_soup.find("div", "page-box house-lst-page-box")["page-data"]
                    max_page = int(json.loads(page_data)["totalPage"])
                    print("max page:", max_page)
                    max_page_found = True
                ul_elt = page_soup.find("ul", "sellListContent")
                # Iterate only the <li> children; iterating the <ul> tag directly would also
                # yield whitespace text nodes and miscount the listings.
                house_elts = ul_elt.find_all("li", recursive=False)
                print("house count:", len(house_elts))
                for house_elt in house_elts:
                    link_elt = house_elt.find("a", "noresultRecommend img LOGCLICKDATA")
                    if link_elt is None:
                        continue
                    house_url = link_elt["href"]
                    if house_url in self.house_set:
                        continue
                    try:
                        self.house_set.add(house_url)
                        title = house_elt.find("div", "title").find("a").get_text()
                        address = house_elt.find("div", "flood").get_text().replace(" ", "")
                        basic_info = house_elt.find("div", "houseInfo").get_text()
                        [layout, area] = self._parse_basic_info(basic_info)
                        total_price = house_elt.find("div", "totalPrice totalPrice2").get_text().replace("万", "").replace("参考价:", "").strip()
                        average_price = house_elt.find("div", "unitPrice").get_text().replace("元/平", "").replace(",", "")
                        house_dict = {}
                        house_dict["u"] = house_url
                        house_dict["t"] = title
                        house_dict["l"] = layout
                        house_dict["ar"] = float(area)
                        house_dict["ad"] = address
                        house_dict["r"] = region
                        house_dict["tp"] = float(total_price)
                        house_dict["ap"] = float(average_price)
                        house_array.append(house_dict)
                    except Exception as e:
                        print("exception:" + str(e))
                print("Sleeping for a while...")
                time.sleep(float(self.cfg["page_interval"]))
                if page % int(self.cfg["pages_of_group"]) == 0:
                    print("Sleeping more...")
                    time.sleep(float(self.cfg["group_interval"]))
            except Exception as ex:
                # After a burst of requests, Lianjia sometimes returns a page with no results;
                # a retry usually succeeds.
                print("exception:" + str(ex))
                print("Something went wrong, retrying in a few seconds...")
                time.sleep(1.2)
                continue
            page += 1
        return house_array

    def write_file(self, file_name, house_info):
        dir_name = os.path.dirname(file_name)
        if not os.path.exists(dir_name):
            # Create the per-city directory (and the parent "houses" directory if needed).
            os.makedirs(dir_name)
        content = json.dumps(house_info, ensure_ascii=False)
        with open(file_name, mode='w', encoding='utf-8') as file_obj:
            file_obj.write(content)
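
    # The written file has the shape below; the keys match the house_dict built in
    # crawl_region, while the record values are illustrative assumptions:
    #
    # {
    #     "date": "2024-01-01",
    #     "houses": [
    #         {"u": "<listing url>", "t": "<title>", "l": "2室1厅", "ar": 89.5,
    #          "ad": "<address>", "r": "pudong", "tp": 520.0, "ap": 58100.0}
    #     ]
    # }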


if __name__ == '__main__':
    crawler = HouseCrawler()
    begin = time.time()
    crawler.crawl()
    end = time.time()
    m, s = divmod(int(end - begin), 60)
    h, m = divmod(m, 60)
    print("Elapsed: {} hours {} minutes {} seconds".format(h, m, s))