index.py
# -*- coding: utf-8 -*-
import csv
import json
import os
import time

import requests
from bs4 import BeautifulSoup

# Mobile Safari headers used for the note-detail page requests.
request_headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh-Hans;q=0.9",
    "Connection": "keep-alive",
    "Host": "www.xiaohongshu.com",
    "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 16_3_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3 Mobile/15E148 Safari/604.1"
}
# Retry helper: re-invoke a failing call up to retry_times times,
# sleeping 1 s between attempts; returns None once all attempts fail.
def retry(func, retry_times=3):
    def wrapper(*args, **kwargs):
        for i in range(retry_times):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                print(f'Attempt {i + 1} failed: {e}')
                time.sleep(1)
        return None  # all retries exhausted; callers must check for None
    return wrapper
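# Hedged usage sketch: retry is applied at the call sites below rather than
# with @decorator syntax; the note id here is hypothetical.
#   fetch = retry(get_note_details, retry_times=3)
#   detail = fetch('64a1b2c3000000000000000')
#   if detail is None:
#       ...  # every attempt raised, handle the failure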
def get_note_details(note_id):
    # Fetch a note page and parse the JSON-LD metadata block embedded in it.
    note_url = f'https://www.xiaohongshu.com/explore/{note_id}'
    response = requests.get(note_url, headers=request_headers, timeout=10)
    response.encoding = response.apparent_encoding
    soup = BeautifulSoup(response.text, 'lxml')
    json_str = soup.find(attrs={'type': 'application/ld+json'}).text
    # Strip carriage returns and newlines, otherwise json.loads fails
    # (parses fine on Windows, breaks on Linux).
    json_str = json_str.replace('\r', '').replace('\n', '')
    result = json.loads(json_str, strict=False)
    if result['name'] != '':
        return result
    raise Exception('Failed to fetch note details: ' + str(response))
def get_page(page_id, cursor='', sort_by='hot'):
    # Request one page of the note list for a topic page.
    url = 'https://www.xiaohongshu.com/web_api/sns/v3/page/notes'
    params = {
        'page_size': 20,
        'sort': sort_by,
        'page_id': page_id,
        'cursor': cursor,
        'sid': ''
    }
    headers = {
        'authority': 'www.xiaohongshu.com',
        'accept': 'application/json, text/plain, */*',
        'accept-language': 'zh,en;q=0.9',
        'cache-control': 'no-cache',
        'cookie': 'smidV2=202204232153470815582356eb14689cc3de5c1e9509c700d854701c3e3a830; gid.sig=J51gqQITVoxM93_so_lXzHqIjeEsRkRzwnpCG-HeHF4; gid.sign.sig=c3QsgEsA_7IIMWY5_glVUzb3NYboY2AOn4vAAIAyu28; gid.ss=gSMQ9UOnDuZwH2oRGJG6BW6e4grs67TaYpnrW+8Wmd2azBbPYqKXIdsuljVz7UBg; timestamp2=1673265619526e4926c09daf76250bf9aa1ed5c6196f6faa2de07c82866f657; timestamp2.sig=O7AvEb24cf7yh5PwyqhN_au9q62nBL_3BNgT9Ff3504; xhsTrackerId=2fc58f61-b4e6-4736-b3c3-696fcbe0ef62; xhsTrackerId.sig=H52BZu6eM9xIkZTwMYc8r8Jp282ITn1Oa7Y2c41H2B8; a1=18612f30083g3ag1rrkgq76869imyxeub8hc88kz730000246697; webId=14e17cb02fbef4963ab87952fd4e3b45; gid=yYKyJiq8JiD0yYKyJiq88YYTYqkq0kyFFTkAxuWWKYKj3Sq86vCd7Y888J4KKjW8iKjD242j; gid.sign=oB2LoOOCblZWcXwhc2Vs2OyJQNU=; xhsTracker=url=explore&searchengine=google; xhsTracker.sig=MRMGzEloSNQs_DaToApEc',
        'pragma': 'no-cache',
        'referer': 'https://www.xiaohongshu.com/explore',
        'sec-ch-ua': '"Google Chrome";v="89", "Chromium";v="89", ";Not A Brand";v="99"',
        'sec-ch-ua-mobile': '?0',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36'
    }
    response = requests.get(url, params=params, headers=headers, timeout=10)
    try:
        response.raise_for_status()
        data = response.json()['data']  # parse the body once, not per field
        return {'notes': data['notes'], 'next_cursor': data['cursor'], 'has_more': data['has_more']}
    except Exception:
        print(response)
        raise Exception('List API request failed')
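# Response shape assumed by the parsing above (inferred from this code,
# not an official schema):
#   {"data": {"notes": [{"id": ..., "type": ..., "likes": ..., "collects": ...}, ...],
#             "cursor": "...", "has_more": true}}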
def get_notes(page_id, file_name, min_size=10000, sort_by='hot'):
    # Page through the note list, fetch details for each non-video note,
    # and write rows to a CSV until min_size notes have been saved.
    has_more = True
    next_cursor = ''
    success_count = 0
    failed_count = 0
    page_count = 0
    os.makedirs('downloads', exist_ok=True)  # ensure the output directory exists
    with open(f'downloads/{file_name}.csv', 'w', newline='', encoding='utf-8-sig') as csvfile:
        fieldnames = ['id', 'title', 'content', 'likes', 'collects', 'date']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        while has_more and success_count < min_size:
            page_count += 1
            list_result = retry(get_page, 3)(page_id, next_cursor, sort_by)
            if list_result is None:
                break  # list request failed even after retries
            notes = list_result['notes']
            next_cursor = list_result['next_cursor']
            # Stop paging once the API stops returning a cursor.
            has_more = list_result['has_more'] if next_cursor is not None else False
            print(f'Page {page_count}')
            for note in notes:
                if note['type'] != 'video':
                    detail = retry(get_note_details, 3)(note['id'])
                    if detail is not None:
                        success_count += 1
                        # Save the row to the CSV file.
                        writer.writerow({
                            'id': note['id'],
                            'title': detail['name'],
                            'content': detail['description'],
                            'likes': note['likes'],
                            'collects': note['collects'],
                            'date': detail['datePublished']
                        })
                        print(f'{success_count} notes saved')
                    else:
                        failed_count += 1
                        print('Failed to crawl note: ' + note['id'])
    print(f'Crawl finished: {success_count} succeeded, {failed_count} failed')
if __name__ == '__main__':
    # Crawl up to 1000 hot notes from the AIGC topic page.
    get_notes('62b20313e35fd70001011700', 'AIGC_hot', 1000, 'hot')
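# A smaller trial run against the same topic page might look like this
# (the output file name is illustrative):
#   get_notes('62b20313e35fd70001011700', 'AIGC_hot_sample', min_size=50)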