Commit
nghuyong committed Apr 29, 2023
1 parent 47d7578 commit 1a3d3d7
Showing 6 changed files with 121 additions and 36 deletions.
96 changes: 68 additions & 28 deletions README.md
@@ -180,32 +180,6 @@ python run_spider.py follow
...
```


### Collecting Tweet Comments

@@ -273,10 +247,75 @@ python run_spider.py repost
...
```

### Collecting Tweets by Tweet ID

```bash
python run_spider.py tweet_by_tweet_id
```

```json
{
"_id": "4762810834227120",
"mblogid": "LqlZNhJFm",
"created_at": "2022-04-27 10:20:54",
"geo": null,
"ip_location": null,
"reposts_count": 1890,
"comments_count": 1924,
"attitudes_count": 12167,
"source": "三星Galaxy S22 Ultra",
"content": "生于乱世纵横四海,义之所在不计生死,孤勇者陈恭一生当如是。#风起陇西今日开播# #风起陇西# 今晚,恭候你!",
"pic_urls": [],
"pic_num": 0,
"isLongText": false,
"user": {
"_id": "1087770692",
"avatar_hd": "https://tvax1.sinaimg.cn/crop.0.0.1080.1080.1024/40d61044ly8gbhxwgy419j20u00u0goc.jpg?KID=imgbed,tva&Expires=1682768013&ssig=r1QurGoc2L",
"nick_name": "陈坤",
"verified": true,
"mbrank": 7,
"mbtype": 12,
"verified_type": 0
},
"video": "http://f.video.weibocdn.com/o0/CmQEWK1ylx07VAm0nrxe01041200YDIc0E010.mp4?label=mp4_720p&template=1280x720.25.0&ori=0&ps=1CwnkDw1GXwCQx&Expires=1682760813&ssig=26udcPSXFJ&KID=unistore,video",
"url": "https://weibo.com/1087770692/LqlZNhJFm",
"crawl_time": 1682757213
}
...
```
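The sample output above contains everything needed to reconstruct both the API request and the public permalink. As a minimal sketch (not part of the repo), assuming the endpoint and field names shown in the example:

```python
def tweet_api_url(mblogid: str) -> str:
    # Ajax endpoint the spider requests for a single tweet (see spider code below)
    return f"https://weibo.com/ajax/statuses/show?id={mblogid}"

def tweet_permalink(user_id: str, mblogid: str) -> str:
    # Public permalink, matching the "url" field in the sample output
    return f"https://weibo.com/{user_id}/{mblogid}"

print(tweet_api_url("LqlZNhJFm"))
print(tweet_permalink("1087770692", "LqlZNhJFm"))
```

Note that the API takes the short base62 `mblogid`, while the numeric `_id` is only needed alongside the user ID when building permalinks.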

### Collecting Tweets by User ID

```bash
python run_spider.py tweet_by_user_id
```

```json
{
"crawl_time": 1666864583,
"_id": "4762810834227120",
"mblogid": "LqlZNhJFm",
"created_at": "2022-04-27 10:20:54",
"geo": null,
"ip_location": null,
"reposts_count": 1907,
"comments_count": 1924,
"attitudes_count": 12169,
"source": "三星Galaxy S22 Ultra",
"content": "生于乱世纵横四海,义之所在不计生死,孤勇者陈恭一生当如是。#风起陇西今日开播# #风起陇西# 今晚,恭候你!",
"pic_urls": [],
"pic_num": 0,
"video": "http://f.video.weibocdn.com/o0/CmQEWK1ylx07VAm0nrxe01041200YDIc0E010.mp4?label=mp4_720p&template=1280x720.25.0&ori=0&ps=1CwnkDw1GXwCQx&Expires=1666868183&ssig=RlIeOt286i&KID=unistore,video",
"url": "https://weibo.com/1087770692/LqlZNhJFm"
}
...
```
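The `crawl_time` field in the output is a Unix timestamp (seconds since the epoch). A quick sketch of converting the sample value to a readable UTC datetime:

```python
from datetime import datetime, timezone

# crawl_time from the sample output above
ts = 1666864583
dt = datetime.fromtimestamp(ts, tz=timezone.utc)
print(dt.strftime("%Y-%m-%d %H:%M:%S"))  # 2022-10-27 09:56:23 (UTC)
```

The `created_at` field, by contrast, is already a formatted local-time string, so only `crawl_time` needs this conversion.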


### Searching Tweets by Keyword

```bash
python run_spider.py tweet_by_keyword
```

```json
@@ -333,8 +372,9 @@ python run_spider.py search

## Changelog

- 2023.04: Support collecting tweets by tweet ID [#272](https://github.com/nghuyong/WeiboSpider/issues/272)
- 2022.11: Support retrieving more than 1,200 pages of search results per day for a single keyword [#257](https://github.com/nghuyong/WeiboSpider/issues/257)
- 2022.11: Support fetching the full text of long tweets
- 2022.11: Keyword search supports specifying a time range
- 2022.10: Add collection of IP location data for users, tweets, and comments
- 2022.10: Refactor the project against the weibo.com site
10 changes: 6 additions & 4 deletions weibospider/run_spider.py
@@ -9,13 +9,14 @@
import sys
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from spiders.tweet_by_user_id import TweetSpiderByUserID
from spiders.tweet_by_keyword import TweetSpiderByKeyword
from spiders.tweet_by_tweet_id import TweetSpiderByTweetID
from spiders.comment import CommentSpider
from spiders.follower import FollowerSpider
from spiders.user import UserSpider
from spiders.fan import FanSpider
from spiders.repost import RepostSpider

if __name__ == '__main__':
mode = sys.argv[1]
@@ -26,10 +27,11 @@
'comment': CommentSpider,
'fan': FanSpider,
'follow': FollowerSpider,
'user': UserSpider,
'repost': RepostSpider,
'tweet_by_tweet_id': TweetSpiderByTweetID,
'tweet_by_user_id': TweetSpiderByUserID,
'tweet_by_keyword': TweetSpiderByKeyword,
}
process.crawl(mode_to_spider[mode])
# the script will block here until the crawling is finished
3 changes: 3 additions & 0 deletions weibospider/spiders/common.py
@@ -5,6 +5,7 @@
Created Time: 2022/10/24
"""
import json
import re

import dateutil.parser

@@ -102,6 +103,8 @@ def parse_tweet_info(data):
'isLongText': False,
"user": parse_user_info(data['user']),
}
if '</a>' in tweet['source']:
tweet['source'] = re.search(r'>(.*?)</a>', tweet['source']).group(1)
if 'page_info' in data and data['page_info'].get('object_type', '') == 'video':
tweet['video'] = data['page_info']['media_info']['mp4_720p_mp4']
tweet['url'] = f"https://weibo.com/{tweet['user']['_id']}/{tweet['mblogid']}"
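The two lines added to `parse_tweet_info` strip the HTML anchor that weibo.com sometimes returns in the `source` field, keeping only the client name. A standalone sketch of the same regex; the `raw` value here is a hypothetical example of what the API might return, not taken from the repo:

```python
import re

def clean_source(source: str) -> str:
    # Same regex as the diff above: keep only the anchor's inner text
    if '</a>' in source:
        return re.search(r'>(.*?)</a>', source).group(1)
    return source

# Hypothetical raw value; plain-string sources pass through unchanged
raw = '<a href="https://app.weibo.com" rel="nofollow">三星Galaxy S22 Ultra</a>'
print(clean_source(raw))  # 三星Galaxy S22 Ultra
```

The non-greedy `(.*?)` matches from the first `>` (the end of the opening tag) up to the closing `</a>`, which is exactly the visible client name.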
@@ -10,11 +10,11 @@
from spiders.common import parse_tweet_info, parse_long_tweet


class TweetSpiderByKeyword(Spider):
"""
Collect tweets by keyword search
"""
name = "tweet_spider_by_keyword"
base_url = "https://s.weibo.com/"

def start_requests(self):
40 changes: 40 additions & 0 deletions weibospider/spiders/tweet_by_tweet_id.py
@@ -0,0 +1,40 @@
#!/usr/bin/env python
# encoding: utf-8
"""
Author: nghuyong
Mail: nghuyong@163.com
Created Time: 2020/4/14
"""
import json
from scrapy import Spider
from scrapy.http import Request
from spiders.common import parse_tweet_info, parse_long_tweet


class TweetSpiderByTweetID(Spider):
"""
Collect tweets by tweet ID
"""
name = "tweet_spider_by_tweet_id"
base_url = "https://weibo.cn"

def start_requests(self):
"""
Spider entry point
"""
# Replace tweet_ids with the actual tweet IDs to collect
tweet_ids = ['LqlZNhJFm']
for tweet_id in tweet_ids:
url = f"https://weibo.com/ajax/statuses/show?id={tweet_id}"
yield Request(url, callback=self.parse)

def parse(self, response, **kwargs):
"""
Parse the response
"""
data = json.loads(response.text)
item = parse_tweet_info(data)
if item['isLongText']:
url = "https://weibo.com/ajax/statuses/longtext?id=" + item['mblogid']
yield Request(url, callback=parse_long_tweet, meta={'item': item})
yield item
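The `isLongText` branch in `parse()` above mirrors the repo's other tweet spiders: weibo.com truncates long tweets, so a second request to the `longtext` endpoint is needed. A sketch of just the URL-building decision, using the endpoint string from the code above:

```python
def long_text_url(item: dict):
    # Follow-up request issued when a tweet's text was truncated by the API
    if item.get("isLongText"):
        return "https://weibo.com/ajax/statuses/longtext?id=" + item["mblogid"]
    return None  # full text already present, no extra request needed

print(long_text_url({"mblogid": "LqlZNhJFm", "isLongText": True}))
```

In the spider itself the item is not returned directly in that case; it travels in `Request.meta` so `parse_long_tweet` can fill in the full `content` before yielding.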
@@ -11,11 +11,11 @@
from spiders.common import parse_tweet_info, parse_long_tweet


class TweetSpiderByUserID(Spider):
"""
Collect tweets by user ID
"""
name = "tweet_spider_by_user_id"
base_url = "https://weibo.cn"

def start_requests(self):
