Skip to content

Commit

Permalink
Merge pull request #538 from 2513502304/main
Browse files Browse the repository at this point in the history
feat: bilibli support date range filter
  • Loading branch information
NanmiCoder authored Jan 17, 2025
2 parents 30d0e73 + 2d93ec5 commit 4b63ea6
Show file tree
Hide file tree
Showing 4 changed files with 123 additions and 45 deletions.
14 changes: 12 additions & 2 deletions config/base_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,17 @@
# 爬取视频/帖子的数量控制
CRAWLER_MAX_NOTES_COUNT = 200

# 爬取开始的天数,仅支持 bilibili 关键字搜索,YYYY-MM-DD 格式,若为 None 则表示不设置时间范围,按照默认关键字最多返回 1000 条视频的结果处理
START_DAY = '2024-01-01'

# 爬取结束的天数,仅支持 bilibili 关键字搜索,YYYY-MM-DD 格式,若为 None 则表示不设置时间范围,按照默认关键字最多返回 1000 条视频的结果处理
END_DAY = '2024-01-01'

# 是否开启按每一天进行爬取的选项,仅支持 bilibili 关键字搜索
# 若为 False,则忽略 START_DAY 与 END_DAY 设置的值
# 若为 True,则按照 START_DAY 至 END_DAY 按照每一天进行筛选,这样能够突破 1000 条视频的限制,最大程度爬取该关键词下的所有视频
ALL_DAY = True

# 并发爬虫数量控制
MAX_CONCURRENCY_NUM = 1

Expand All @@ -69,7 +80,6 @@
# 爬取一级评论的数量控制(单视频/帖子)
CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = 10


# 是否开启爬二级评论模式, 默认不开启爬二级评论
# 老版本项目使用了 db, 则需参考 schema/tables.sql line 287 增加表字段
ENABLE_GET_SUB_COMMENTS = False
Expand All @@ -87,7 +97,6 @@
# ........................
]


# 指定抖音需要爬取的ID列表
DY_SPECIFIED_ID_LIST = [
"7280854932641664319",
Expand Down Expand Up @@ -126,6 +135,7 @@
# "盗墓笔记"
]

# 指定贴吧创作者URL列表
TIEBA_CREATOR_URL_LIST = [
"https://tieba.baidu.com/home/main/?id=tb.1.7f139e2e.6CyEwxu3VJruH_-QqpCi6g&fr=frs",
# ........................
Expand Down
4 changes: 2 additions & 2 deletions media_platform/bilibili/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,8 +147,8 @@ async def search_video_by_keyword(self, keyword: str, page: int = 1, page_size:
"page": page,
"page_size": page_size,
"order": order.value,
"pubtime_begin": pubtime_begin_s,
"pubtime_end": pubtime_end_s
"pubtime_begin_s": pubtime_begin_s,
"pubtime_end_s": pubtime_end_s
}
return await self.get(uri, post_data)

Expand Down
147 changes: 107 additions & 40 deletions media_platform/bilibili/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,10 @@
import random
from asyncio import Task
from typing import Dict, List, Optional, Tuple, Union
from datetime import datetime, timedelta
import pandas as pd

from playwright.async_api import (BrowserContext, BrowserType, Page,
async_playwright)
from playwright.async_api import (BrowserContext, BrowserType, Page, async_playwright)

import config
from base.base_crawler import AbstractCrawler
Expand Down Expand Up @@ -95,56 +96,122 @@ async def start(self):
utils.logger.info(
"[BilibiliCrawler.start] Bilibili Crawler finished ...")

async def get_pubtime_datetime(self, start: str = config.START_DAY, end: str = config.END_DAY) -> tuple[str, str]:
"""
获取 bilibili 作品发布日期起始时间戳 pubtime_begin_s 与发布日期结束时间戳 pubtime_end_s
---
:param start: 发布日期起始时间,YYYY-MM-DD
:param end: 发布日期结束时间,YYYY-MM-DD
Note
---
- 搜索的时间范围为 start 至 end,包含 start 和 end
- 若要搜索同一天的内容,为了包含 start 当天的搜索内容,则 pubtime_end_s 的值应该为 pubtime_begin_s 的值加上一天再减去一秒,即 start 当天的最后一秒
- 如仅搜索 2024-01-05 的内容,pubtime_begin_s = 1704384000,pubtime_end_s = 1704470399
转换为可读的 datetime 对象:pubtime_begin_s = datetime.datetime(2024, 1, 5, 0, 0),pubtime_end_s = datetime.datetime(2024, 1, 5, 23, 59, 59)
- 若要搜索 start 至 end 的内容,为了包含 end 当天的搜索内容,则 pubtime_end_s 的值应该为 pubtime_end_s 的值加上一天再减去一秒,即 end 当天的最后一秒
- 如搜索 2024-01-05 - 2024-01-06 的内容,pubtime_begin_s = 1704384000,pubtime_end_s = 1704556799
转换为可读的 datetime 对象:pubtime_begin_s = datetime.datetime(2024, 1, 5, 0, 0),pubtime_end_s = datetime.datetime(2024, 1, 6, 23, 59, 59)
"""
# 转换 start 与 end 为 datetime 对象
start_day: datetime = datetime.strptime(start, '%Y-%m-%d')
end_day: datetime = datetime.strptime(end, '%Y-%m-%d')
if start_day > end_day:
raise ValueError('Wrong time range, please check your start and end argument, to ensure that the start cannot exceed end')
elif start_day == end_day: # 搜索同一天的内容
end_day = start_day + timedelta(days=1) - timedelta(seconds=1) # 则将 end_day 设置为 start_day + 1 day - 1 second
else: # 搜索 start 至 end
end_day = end_day + timedelta(days=1) - timedelta(seconds=1) # 则将 end_day 设置为 end_day + 1 day - 1 second
# 将其重新转换为时间戳
return str(int(start_day.timestamp())), str(int(end_day.timestamp()))

async def search(self):
"""
search bilibili video with keywords
:return:
"""
utils.logger.info(
"[BilibiliCrawler.search] Begin search bilibli keywords")
utils.logger.info("[BilibiliCrawler.search] Begin search bilibli keywords")
bili_limit_count = 20 # bilibili limit page fixed value
if config.CRAWLER_MAX_NOTES_COUNT < bili_limit_count:
config.CRAWLER_MAX_NOTES_COUNT = bili_limit_count
start_page = config.START_PAGE # start page number
for keyword in config.KEYWORDS.split(","):
source_keyword_var.set(keyword)
utils.logger.info(
f"[BilibiliCrawler.search] Current search keyword: {keyword}")
page = 1
while (page - start_page + 1) * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
if page < start_page:
utils.logger.info(
f"[BilibiliCrawler.search] Skip page: {page}")
utils.logger.info(f"[BilibiliCrawler.search] Current search keyword: {keyword}")
# 每个关键词最多返回 1000 条数据
if not config.ALL_DAY:
page = 1
while (page - start_page + 1) * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
if page < start_page:
utils.logger.info(f"[BilibiliCrawler.search] Skip page: {page}")
page += 1
continue

utils.logger.info(f"[BilibiliCrawler.search] search bilibili keyword: {keyword}, page: {page}")
video_id_list: List[str] = []
videos_res = await self.bili_client.search_video_by_keyword(
keyword=keyword,
page=page,
page_size=bili_limit_count,
order=SearchOrderType.DEFAULT,
pubtime_begin_s=0, # 作品发布日期起始时间戳
pubtime_end_s=0 # 作品发布日期结束日期时间戳
)
video_list: List[Dict] = videos_res.get("result")

semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for video_item in video_list]
video_items = await asyncio.gather(*task_list)
for video_item in video_items:
if video_item:
video_id_list.append(video_item.get("View").get("aid"))
await bilibili_store.update_bilibili_video(video_item)
await bilibili_store.update_up_info(video_item)
await self.get_bilibili_video(video_item, semaphore)
page += 1
continue

utils.logger.info(f"[BilibiliCrawler.search] search bilibili keyword: {keyword}, page: {page}")
video_id_list: List[str] = []
videos_res = await self.bili_client.search_video_by_keyword(
keyword=keyword,
page=page,
page_size=bili_limit_count,
order=SearchOrderType.DEFAULT,
pubtime_begin_s=0, # 作品发布日期起始时间戳
pubtime_end_s=0 # 作品发布日期结束日期时间戳
)
video_list: List[Dict] = videos_res.get("result")

semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = [
self.get_video_info_task(aid=video_item.get(
"aid"), bvid="", semaphore=semaphore)
for video_item in video_list
]
video_items = await asyncio.gather(*task_list)
for video_item in video_items:
if video_item:
video_id_list.append(video_item.get("View").get("aid"))
await bilibili_store.update_bilibili_video(video_item)
await bilibili_store.update_up_info(video_item)
await self.get_bilibili_video(video_item, semaphore)
page += 1
await self.batch_get_video_comments(video_id_list)
await self.batch_get_video_comments(video_id_list)
# 按照 START_DAY 至 END_DAY 按照每一天进行筛选,这样能够突破 1000 条视频的限制,最大程度爬取该关键词下的所有视频
else:
for day in pd.date_range(start=config.START_DAY, end=config.END_DAY, freq='D'):
# 按照每一天进行爬取的时间戳参数
pubtime_begin_s, pubtime_end_s = await self.get_pubtime_datetime(start=day.strftime('%Y-%m-%d'), end=day.strftime('%Y-%m-%d'))
page = 1
while (page - start_page + 1) * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
# ! Catch any error if response return nothing, go to next day
try:
# ! Don't skip any page, to make sure gather all video in one day
# if page < start_page:
# utils.logger.info(f"[BilibiliCrawler.search] Skip page: {page}")
# page += 1
# continue

utils.logger.info(f"[BilibiliCrawler.search] search bilibili keyword: {keyword}, date: {day.ctime()}, page: {page}")
video_id_list: List[str] = []
videos_res = await self.bili_client.search_video_by_keyword(
keyword=keyword,
page=page,
page_size=bili_limit_count,
order=SearchOrderType.DEFAULT,
pubtime_begin_s=pubtime_begin_s, # 作品发布日期起始时间戳
pubtime_end_s=pubtime_end_s # 作品发布日期结束日期时间戳
)
video_list: List[Dict] = videos_res.get("result")

semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for video_item in video_list]
video_items = await asyncio.gather(*task_list)
for video_item in video_items:
if video_item:
video_id_list.append(video_item.get("View").get("aid"))
await bilibili_store.update_bilibili_video(video_item)
await bilibili_store.update_up_info(video_item)
await self.get_bilibili_video(video_item, semaphore)
page += 1
await self.batch_get_video_comments(video_id_list)
# go to next day
except Exception as e:
print(e)
break

async def batch_get_video_comments(self, video_id_list: List[str]):
"""
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,5 @@ wordcloud==1.9.3
matplotlib==3.9.0
requests==2.32.3
parsel==1.9.1
pyexecjs==1.5.1
pyexecjs==1.5.1
pandas==2.2.3

0 comments on commit 4b63ea6

Please sign in to comment.