-
Notifications
You must be signed in to change notification settings - Fork 0
/
comment.py
186 lines (141 loc) · 5.27 KB
/
comment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
from dataclasses import dataclass
from datetime import datetime, timezone
import json
import os
from typing import Optional
from bs4 import BeautifulSoup
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.common.by import By
from post import get_post_id
def _get_comment_id(url: str) -> str:
return url.split("/")[-1].split("?")[-1]
@dataclass
class Comment:
    """Archived snapshot of one scraped comment.

    Scraped values are stored as the raw text read from the page, or
    ``None`` when the corresponding element was not found; nothing is
    parsed into numbers or dates here.
    """

    # Display name of the commenter.
    author: Optional[str]
    # Publication time as shown on the page (text, not a timestamp).
    relative_date: Optional[str]
    # Tooltip text of the commenter's custom badge, if any.
    member_length: Optional[str]
    # Like-counter text.
    likes: Optional[str]
    # Whether the creator-heart element is displayed.
    is_hearted: bool
    # Whether the pinned-comment badge is displayed.
    is_pinned: bool
    # Comment body text.
    contents: Optional[str]
    # Text of the "more replies" element.
    replies: Optional[str]
    # Link to the comment; also used to derive the output file name.
    link: Optional[str]
    # Stringified time at which the comment was archived.
    when_archived: str

    def save(self, output_dir: str, post_url: str):
        """Dump this comment as JSON under ``<output_dir>/<post id>/comments``.

        The file is named ``<comment id>.json``, where the comment id is
        derived from ``self.link`` (or ``"unknown"`` when there is no
        link). Saving is best-effort: on failure an ``err:`` line is
        printed and the method returns without raising.

        Args:
            output_dir: Root directory of the archive.
            post_url: URL of the post the comment belongs to; passed to
                ``get_post_id`` to pick the post's sub-directory.
        """
        post_id = get_post_id(post_url)
        comment_dir = os.path.join(output_dir, post_id, "comments")
        try:
            # makedirs also creates a missing post directory and, with
            # exist_ok, tolerates the directory appearing concurrently.
            os.makedirs(comment_dir, exist_ok=True)
        except OSError:
            print(f"err: couldn't make directory at {comment_dir}")
            return

        comment_id = _get_comment_id(self.link) if self.link else "unknown"
        comment_path = os.path.join(comment_dir, f"{comment_id}.json")
        try:
            with open(comment_path, "w", encoding="utf-8") as f:
                json.dump(
                    self.__dict__,
                    f,
                    ensure_ascii=False,
                    indent=4,
                    default=lambda o: o.__dict__,
                    skipkeys=True,
                )
        except Exception:
            # Deliberately broad so one bad comment does not abort an
            # archive run, but KeyboardInterrupt/SystemExit still propagate.
            print(f"err: couldn't save comment data dump at {comment_path}")
def _get_author(comment: WebElement) -> Optional[str]:
    """Return the comment author's name, or ``None`` if it cannot be found.

    The element with id ``author-text`` is tried first. If it is missing
    or its text is blank, the first ``yt-formatted-string`` inside the
    element with id ``channel-name`` is used instead.
    """
    primary = comment.find_elements(By.ID, "author-text")
    if primary:
        author = primary[0].text.strip()
        if author:
            return author

    channel_name = comment.find_elements(By.ID, "channel-name")
    if not channel_name:
        return None

    name_nodes = channel_name[0].find_elements(By.TAG_NAME, "yt-formatted-string")
    # find_elements yields an empty list when nothing matches, so guard
    # before indexing instead of raising IndexError.
    if not name_nodes:
        return None

    author = name_nodes[0].text.strip()
    return author or None
def _get_relative_date(comment: WebElement) -> Optional[str]:
    """Return the text of the ``published-time-text`` element.

    Returns ``None`` when the element is absent or its text is blank.
    """
    possible_relative_date = comment.find_elements(By.ID, "published-time-text")
    # An empty result list must not be indexed; treat it like the other
    # getters do and report the field as missing.
    if not possible_relative_date:
        return None
    relative_date = possible_relative_date[0].text.strip()
    return relative_date if relative_date else None
def _get_member_length(comment: WebElement) -> Optional[str]:
    """Return the tooltip text of the commenter's custom badge.

    Reads the ``shared-tooltip-text`` attribute of the first
    ``yt-img-shadow`` inside the element with id ``custom-badge``.
    Returns ``None`` when the badge, the image, or the attribute is
    missing, or when the attribute is blank.
    """
    possible_member_length = comment.find_elements(By.ID, "custom-badge")
    if not possible_member_length:
        return None

    possible_text = possible_member_length[0].find_elements(
        By.TAG_NAME, "yt-img-shadow"
    )
    if not possible_text:
        return None

    tooltip = possible_text[0].get_attribute("shared-tooltip-text")
    # get_attribute returns None for an attribute that is not set;
    # calling .strip() on that would raise AttributeError.
    if tooltip is None:
        return None
    length = tooltip.strip()
    return length if length else None
def _get_likes(comment: WebElement) -> Optional[str]:
    """Return the stripped text of the ``vote-count-middle`` element.

    Returns ``None`` when the element is absent or its text is blank.
    """
    counters = comment.find_elements(By.ID, "vote-count-middle")
    if counters:
        count_text = counters[0].text.strip()
        if count_text:
            return count_text
    return None
def _get_is_hearted(comment: WebElement) -> bool:
    """Report whether a ``creator-heart-button`` element exists and is displayed."""
    hearts = comment.find_elements(By.ID, "creator-heart-button")
    return hearts[0].is_displayed() if hearts else False
def _get_is_pinned(comment: WebElement) -> bool:
    """Report whether a ``pinned-comment-badge`` element exists and is displayed."""
    badges = comment.find_elements(By.ID, "pinned-comment-badge")
    if badges:
        return badges[0].is_displayed()
    return False
def _get_contents(comment: WebElement) -> Optional[str]:
    """Extract the comment body as plain text, keeping emotes as markers.

    Parses the element's ``innerHTML``, finds the first ``span`` inside the
    node with id ``content-text`` and walks its descendants: text nodes are
    appended verbatim, and each ``img`` with a non-empty ``alt`` becomes
    ``<::alt::>``. Returns ``None`` when the HTML, the wrapper, or the span
    is missing.
    """
    # NOTE: tightly coupled to the page's current markup; worth testing
    # whenever the page layout changes.
    inner_html = comment.get_attribute("innerHTML")
    if not inner_html:
        return None

    wrapper = BeautifulSoup(inner_html, "html.parser").find(id="content-text")
    if not wrapper:
        return None

    span = wrapper.find("span")
    if not span:
        return None

    pieces = []
    for node in span.descendants:
        if node.name == "img":
            alt = node.get("alt")
            if alt:
                pieces.append(f"<::{alt}::>")
        elif node.name is None:
            # Text nodes have no tag name.
            pieces.append(node.text)
    return "".join(pieces)
def _get_replies(comment: WebElement) -> Optional[str]:
    """Return the stripped text of the ``more-replies`` element.

    Returns ``None`` when the element is absent or its text is blank.
    """
    reply_nodes = comment.find_elements(By.ID, "more-replies")
    if reply_nodes:
        reply_text = reply_nodes[0].text.strip()
        if reply_text:
            return reply_text
    return None
def build_comment(comment: WebElement, link: str) -> Comment:
    """Assemble a ``Comment`` from a comment element on the page.

    Each field is read by its dedicated ``_get_*`` helper, ``link`` is
    stored as given, and ``when_archived`` is the current UTC time
    converted to a string.
    """
    return Comment(
        author=_get_author(comment),
        relative_date=_get_relative_date(comment),
        member_length=_get_member_length(comment),
        likes=_get_likes(comment),
        is_hearted=_get_is_hearted(comment),
        is_pinned=_get_is_pinned(comment),
        contents=_get_contents(comment),
        replies=_get_replies(comment),
        link=link,
        when_archived=str(datetime.now(tz=timezone.utc)),
    )