diff --git a/README.md b/README.md
index bed5808b..1f57d883 100644
--- a/README.md
+++ b/README.md
@@ -180,32 +180,6 @@ python run_spider.py follow
 ...
 ```
 
-### 用户的微博采集
-
-```bash
-python run_spider.py tweet
-```
-
-```json
-{
-    "crawl_time": 1666864583,
-    "_id": "4762810834227120",
-    "mblogid": "LqlZNhJFm",
-    "created_at": "2022-04-27 10:20:54",
-    "geo": null,
-    "ip_location": null,
-    "reposts_count": 1907,
-    "comments_count": 1924,
-    "attitudes_count": 12169,
-    "source": "三星Galaxy S22 Ultra",
-    "content": "生于乱世纵横四海,义之所在不计生死,孤勇者陈恭一生当如是。#风起陇西今日开播# #风起陇西# 今晚,恭候你!",
-    "pic_urls": [],
-    "pic_num": 0,
-    "video": "http://f.video.weibocdn.com/o0/CmQEWK1ylx07VAm0nrxe01041200YDIc0E010.mp4?label=mp4_720p&template=1280x720.25.0&ori=0&ps=1CwnkDw1GXwCQx&Expires=1666868183&ssig=RlIeOt286i&KID=unistore,video",
-    "url": "https://weibo.com/1087770692/LqlZNhJFm"
-}
-...
-```
 
 ### 微博评论采集
 
@@ -273,10 +247,75 @@ python run_spider.py repost
 ...
 ```
 
+### 基于推文ID的推文采集
+
+```bash
+python run_spider.py tweet_by_tweet_id
+```
+
+```json
+{
+    "_id": "4762810834227120",
+    "mblogid": "LqlZNhJFm",
+    "created_at": "2022-04-27 10:20:54",
+    "geo": null,
+    "ip_location": null,
+    "reposts_count": 1890,
+    "comments_count": 1924,
+    "attitudes_count": 12167,
+    "source": "三星Galaxy S22 Ultra",
+    "content": "生于乱世纵横四海,义之所在不计生死,孤勇者陈恭一生当如是。#风起陇西今日开播# #风起陇西# 今晚,恭候你!",
+    "pic_urls": [],
+    "pic_num": 0,
+    "isLongText": false,
+    "user": {
+        "_id": "1087770692",
+        "avatar_hd": "https://tvax1.sinaimg.cn/crop.0.0.1080.1080.1024/40d61044ly8gbhxwgy419j20u00u0goc.jpg?KID=imgbed,tva&Expires=1682768013&ssig=r1QurGoc2L",
+        "nick_name": "陈坤",
+        "verified": true,
+        "mbrank": 7,
+        "mbtype": 12,
+        "verified_type": 0
+    },
+    "video": "http://f.video.weibocdn.com/o0/CmQEWK1ylx07VAm0nrxe01041200YDIc0E010.mp4?label=mp4_720p&template=1280x720.25.0&ori=0&ps=1CwnkDw1GXwCQx&Expires=1682760813&ssig=26udcPSXFJ&KID=unistore,video",
+    "url": "https://weibo.com/1087770692/LqlZNhJFm",
+    "crawl_time": 1682757213
+}
+...
+```
+
+### 用户的微博采集
+
+```bash
+python run_spider.py tweet_by_user_id
+```
+
+```json
+{
+    "crawl_time": 1666864583,
+    "_id": "4762810834227120",
+    "mblogid": "LqlZNhJFm",
+    "created_at": "2022-04-27 10:20:54",
+    "geo": null,
+    "ip_location": null,
+    "reposts_count": 1907,
+    "comments_count": 1924,
+    "attitudes_count": 12169,
+    "source": "三星Galaxy S22 Ultra",
+    "content": "生于乱世纵横四海,义之所在不计生死,孤勇者陈恭一生当如是。#风起陇西今日开播# #风起陇西# 今晚,恭候你!",
+    "pic_urls": [],
+    "pic_num": 0,
+    "video": "http://f.video.weibocdn.com/o0/CmQEWK1ylx07VAm0nrxe01041200YDIc0E010.mp4?label=mp4_720p&template=1280x720.25.0&ori=0&ps=1CwnkDw1GXwCQx&Expires=1666868183&ssig=RlIeOt286i&KID=unistore,video",
+    "url": "https://weibo.com/1087770692/LqlZNhJFm"
+}
+...
+```
+
+
 ### 基于关键词的微博检索
 
 ```bash
-python run_spider.py search
+python run_spider.py tweet_by_keyword
 ```
 
 ```json
@@ -333,8 +372,9 @@ python run_spider.py search
 
 ## 更新日志
 
+- 2023.04: 支持针对推文id的推文采集 [#272](https://github.com/nghuyong/WeiboSpider/issues/272)
 - 2022.11: 支持针对单个关键词获取单天超过1200页的检索结果 [#257](https://github.com/nghuyong/WeiboSpider/issues/257)
-- 2022.11: 支持长微博全文的获取
+- 2022.11: 支持长微博全文的获取
 - 2022.11: 基于关键词微博搜索支持指定时间范围
 - 2022.10: 添加IP归属地信息的采集,包括用户数据,微博数据和微博评论数据
 - 2022.10: 基于weibo.com站点对项目进行重构
diff --git a/weibospider/run_spider.py b/weibospider/run_spider.py
index ddb5567f..e80ee40a 100644
--- a/weibospider/run_spider.py
+++ b/weibospider/run_spider.py
@@ -9,13 +9,14 @@
 import sys
 from scrapy.crawler import CrawlerProcess
 from scrapy.utils.project import get_project_settings
-from spiders.tweet import TweetSpider
+from spiders.tweet_by_user_id import TweetSpiderByUserID
+from spiders.tweet_by_keyword import TweetSpiderByKeyword
+from spiders.tweet_by_tweet_id import TweetSpiderByTweetID
 from spiders.comment import CommentSpider
 from spiders.follower import FollowerSpider
 from spiders.user import UserSpider
 from spiders.fan import FanSpider
 from spiders.repost import RepostSpider
-from spiders.search import SearchSpider
 
 if __name__ == '__main__':
     mode = sys.argv[1]
@@ -26,10 +27,11 @@
         'comment': CommentSpider,
         'fan': FanSpider,
         'follow': FollowerSpider,
-        'tweet': TweetSpider,
         'user': UserSpider,
         'repost': RepostSpider,
-        'search': SearchSpider
+        'tweet_by_tweet_id': TweetSpiderByTweetID,
+        'tweet_by_user_id': TweetSpiderByUserID,
+        'tweet_by_keyword': TweetSpiderByKeyword,
     }
     process.crawl(mode_to_spider[mode])
     # the script will block here until the crawling is finished
diff --git a/weibospider/spiders/common.py b/weibospider/spiders/common.py
index 75e3629c..ffcea0a7 100644
--- a/weibospider/spiders/common.py
+++ b/weibospider/spiders/common.py
@@ -5,6 +5,7 @@ Created Time: 2022/10/24
 """
 import json
+import re
 
 import dateutil.parser
 
 
@@ -102,6 +103,8 @@
         'isLongText': False,
         "user": parse_user_info(data['user']),
     }
+    if '</a>' in tweet['source']:
+        tweet['source'] = re.search(r'>(.*?)</a>', tweet['source']).group(1)
     if 'page_info' in data and data['page_info'].get('object_type', '') == 'video':
         tweet['video'] = data['page_info']['media_info']['mp4_720p_mp4']
     tweet['url'] = f"https://weibo.com/{tweet['user']['_id']}/{tweet['mblogid']}"
diff --git a/weibospider/spiders/search.py b/weibospider/spiders/tweet_by_keyword.py
similarity index 97%
rename from weibospider/spiders/search.py
rename to weibospider/spiders/tweet_by_keyword.py
index 8f9b8e06..0adf1d0c 100644
--- a/weibospider/spiders/search.py
+++ b/weibospider/spiders/tweet_by_keyword.py
@@ -10,11 +10,11 @@
 from spiders.common import parse_tweet_info, parse_long_tweet
 
 
-class SearchSpider(Spider):
+class TweetSpiderByKeyword(Spider):
     """
     关键词搜索采集
     """
-    name = "search_spider"
+    name = "tweet_spider_by_keyword"
     base_url = "https://s.weibo.com/"
 
     def start_requests(self):
diff --git a/weibospider/spiders/tweet_by_tweet_id.py b/weibospider/spiders/tweet_by_tweet_id.py
new file mode 100644
index 00000000..ee196b4a
--- /dev/null
+++ b/weibospider/spiders/tweet_by_tweet_id.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python
+# encoding: utf-8
+"""
+Author: nghuyong
+Mail: nghuyong@163.com
+Created Time: 2020/4/14
+"""
+import json
+from scrapy import Spider
+from scrapy.http import Request
+from spiders.common import parse_tweet_info, parse_long_tweet
+
+
+class TweetSpiderByTweetID(Spider):
+    """
+    用户推文ID采集推文
+    """
+    name = "tweet_spider_by_tweet_id"
+    base_url = "https://weibo.cn"
+
+    def start_requests(self):
+        """
+        爬虫入口
+        """
+        # 这里tweet_ids可替换成实际待采集的数据
+        tweet_ids = ['LqlZNhJFm']
+        for tweet_id in tweet_ids:
+            url = f"https://weibo.com/ajax/statuses/show?id={tweet_id}"
+            yield Request(url, callback=self.parse)
+
+    def parse(self, response, **kwargs):
+        """
+        网页解析
+        """
+        data = json.loads(response.text)
+        item = parse_tweet_info(data)
+        if item['isLongText']:
+            url = "https://weibo.com/ajax/statuses/longtext?id=" + item['mblogid']
+            yield Request(url, callback=parse_long_tweet, meta={'item': item})
+        yield item
diff --git a/weibospider/spiders/tweet.py b/weibospider/spiders/tweet_by_user_id.py
similarity index 95%
rename from weibospider/spiders/tweet.py
rename to weibospider/spiders/tweet_by_user_id.py
index 35dbdf16..350a3c9f 100644
--- a/weibospider/spiders/tweet.py
+++ b/weibospider/spiders/tweet_by_user_id.py
@@ -11,11 +11,11 @@
 from spiders.common import parse_tweet_info, parse_long_tweet
 
 
-class TweetSpider(Spider):
+class TweetSpiderByUserID(Spider):
     """
     用户推文数据采集
     """
-    name = "tweet_spider"
+    name = "tweet_spider_by_user_id"
     base_url = "https://weibo.cn"
 
     def start_requests(self):