# 3rd party modules
from anytree import Node, PreOrderIter
from anytree import util as anytree_util
import praw, prawcore, markdown2, yaml, colored
# stdlib
import datetime, os, sys, argparse, re
__NAME__ = "RedditArchiver-standalone"
__VERSION__ = "2.0.2"
# -------------------------- #
#         Arguments          #
# -------------------------- #
parser = argparse.ArgumentParser(description="Standalone version of RedditArchiver. Lets you download Reddit threads in a nicely readable HTML file.", add_help=False)
parser_g1 = parser.add_argument_group(title='Selection', description="Use at least one of these options to select what you want to save. Arguments can be used several times to specify more than one ID, URL or author.")
parser_g1.add_argument('-i', '--id', help='ID or URL of a submission', metavar='ID/URL', action='append')
parser_g1.add_argument('-I', '--file', help='same as -i, but reads IDs or URLs from a file', metavar='file', action='append')
parser_g1.add_argument('-s', '--saved', help='your saved submissions', action="store_true")
parser_g1.add_argument('-S', '--saved-extended', help='same as -s, but also saves the submissions that you saved a comment from', action="store_true")
parser_g1.add_argument('-a', '--author', help='submissions posted from someone (by default: yourself)', metavar="name", nargs='?', action='append')
parser_g1.add_argument('-A', '--author-extended', help='same as -a, but also saves the submissions where the person posted a comment in', metavar="name", nargs='?', action='append')
parser_g1.add_argument('-u', '--upvoted', help='submissions that you upvoted', action="store_true")
parser_g2 = parser.add_argument_group(title='Various', description="Other options controlling various things such as configuration, output directory...")
parser_g2.add_argument('-l', '--limit', help='limits the number of submissions retrieved with -s/-S, -a/-A and -u (newest first). Please note that the maximum is 1000 and Reddit will refuse to give anything past this limit.', metavar="N", type=int, default=1000)
parser_g2.add_argument('-c', '--config', help='uses a different config file (default: ./config.yml).', metavar='path', default="./config.yml")
parser_g2.add_argument('-o', '--output', help='output directory (default: current directory)', metavar='path', default="./")
parser_g2.add_argument('-q', '--quiet', help='will not generate any message (except for errors)', action='store_true')
parser_g2.add_argument('-h', '--help', action='help', help='show this help message and exit')
# Advanced arguments (normally hidden)
parser.add_argument('--disable-recursion-limit', help=argparse.SUPPRESS, action='store_true')
args = parser.parse_args()
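# Example invocations (illustrative; IDs, usernames and paths are made up):
#   python RedditArchiver.py -i https://www.reddit.com/r/python/comments/abc123/some_thread/
#   python RedditArchiver.py -s -l 50 -o ./archives
#   python RedditArchiver.py -a someuser -A anotheruser -q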
# -------------------------- #
#         Functions          #
# -------------------------- #
def extract_id(url):
    """
    Extracts the submission ID from a supplied URL
    """
    regexes = (r"^([a-z0-9]+)/?$",
               r"^https?:\/\/(?:(?:old|new|www)\.)?reddit\.com\/([a-z0-9]+)\/?$",
               r"^https?:\/\/(?:(?:old|new|www)\.)?reddit\.com\/r\/[a-zA-Z0-9\-_]+\/comments\/([a-z0-9]+)\/?")
    for regex in regexes:
        result = re.search(regex, url)
        if result is not None:
            return result.group(1)
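# Illustrative behavior of extract_id() (IDs here are made up):
#   extract_id("abc123")                                                      -> "abc123"
#   extract_id("https://reddit.com/abc123")                                   -> "abc123"
#   extract_id("https://old.reddit.com/r/python/comments/abc123/some_title/") -> "abc123"
#   extract_id("https://example.com/whatever")                                -> None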
def connect():
    """
    Initiates and tests the connection to Reddit.
    """
    reddit = praw.Reddit(client_id=config['reddit']['client-id'], client_secret=config['reddit']['client-secret'], refresh_token=config['reddit']['refresh-token'], user_agent=f"{__NAME__} v{__VERSION__} by /u/ailothaen")
    reddit.auth.scopes() # cheap authenticated call: fails early if the credentials are not valid
    return reddit
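# The config file (loaded below under "Config loading") must provide at least the keys
# used throughout this script. A minimal sketch, with placeholder values:
#   reddit:
#     client-id: "your-app-client-id"
#     client-secret: "your-app-client-secret"
#     refresh-token: "your-refresh-token"
#     root: "https://www.reddit.com"
#   defaults:
#     dateformat: "%Y-%m-%d %H:%M"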
def get_saved_submissions(extended=False, limit=1000):
    """
    Retrieves the list of saved submission IDs of the authenticated user.
    If extended is True: returns as well the submissions which the user saved a comment from
    """
    submission_ids = []
    for item in reddit.user.me().saved(limit=limit):
        # Fullname prefixes: "t3_" is a submission, "t1_" is a comment
        if item.name[0:2] == "t3":
            submission_ids.append(item.id)
        elif item.name[0:2] == "t1" and extended:
            submission_ids.append(item.link_id[3:]) # link_id is "t3_<id>"; stripping the prefix
    return submission_ids
def get_upvoted_submissions(limit=1000):
    """
    Retrieves the list of upvoted submission IDs of the authenticated user.
    """
    submission_ids = []
    for item in reddit.user.me().upvoted(limit=limit):
        # Reddit seemingly does not return upvoted comments, only submissions
        if item.name[0:2] == "t3":
            submission_ids.append(item.id)
    return submission_ids
def get_posted_submissions(author=None, extended=False, limit=1000):
    """
    Retrieves the list of submission IDs posted by the authenticated user (or by "author" if given).
    If extended is True: returns as well the submissions which the user posted a comment in
    """
    submission_ids = []
    if author is None:
        user = reddit.user.me()
    else:
        user = reddit.redditor(author)
    if extended:
        for item in user.submissions.new(limit=limit*2):
            submission_ids.append([item.id, item.created_utc])
        for item in user.comments.new(limit=limit*2):
            submission_ids.append([item.submission.id, item.created_utc])
        # keeping only the <limit> newest comments/submissions
        submission_ids.sort(key=lambda x: x[1], reverse=True)
        submission_ids = [item[0] for item in submission_ids[:limit]]
    else:
        for item in user.submissions.new(limit=limit):
            submission_ids.append(item.id)
    return submission_ids
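# Illustrative example: with author=None, extended=True and limit=100, up to 200 of your
# submissions and 200 of your comments are fetched, merged, sorted by creation date
# (newest first), and the 100 newest parent submission IDs are kept.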
def comment_parser(initial_text):
    """
    Parses Reddit's pseudo-markdown into HTML formatting
    """
    # escaping HTML characters
    text = initial_text.replace('<', '&lt;')
    text = text.replace('>', '&gt;')
    # transforming markdown to HTML
    text = markdown2.markdown(text)
    # converting linebreaks to HTML
    text = text.replace('\n\n', '</p><p>')
    text = text.replace('\n', '<br>')
    # removing the trailing <br> left over by the final linebreak
    if text[-4:] == '<br>':
        text = text[:-4]
    return text
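# Illustrative example (exact output depends on the markdown2 version):
#   comment_parser("Hello *world*, a < b")
#   -> something like '<p>Hello <em>world</em>, a &lt; b</p>'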
def get_submission(reddit, submission_id):
    """
    Retrieves submission object
    """
    submission = reddit.submission(id=submission_id)
    nb_replies = submission.num_comments
    return submission, nb_replies
def download_submission(submission, submission_id):
    """
    Retrieves the submission and its comments from the Reddit API.
    Returns the submission plus two dicts: the tree structure of the thread (comments_index) and a flat mapping of comments to their attributes (comments_forest).
    """
    # Contains all the node objects (for the tree structure)
    comments_index = {}
    # Contains all the comment objects
    # (single-letter keys: a=author, b=body, d=distinguished, e=edited, l=permalink, o=is_submitter, s=score, t=created_utc)
    comments_forest = {}
    # Creating root node: the submission itself
    comments_index['t3_'+submission_id] = Node('t3_'+submission_id)
    # Getting all comments in tree order, according to the sorting algorithm defined.
    # See https://praw.readthedocs.io/en/latest/tutorials/comments.html#extracting-comments
    submission.comments.replace_more(limit=None)
    # Filling index and forest
    comment_queue = submission.comments[:]
    while comment_queue:
        comment = comment_queue.pop(0)
        comments_index['t1_'+comment.id] = Node('t1_'+comment.id, parent=comments_index[comment.parent_id])
        comments_forest['t1_'+comment.id] = {'a': '(deleted)' if comment.author is None else comment.author.name, 'b': '(deleted)' if comment.body is None else comment.body, 'd': comment.distinguished, 'e': comment.edited, 'l': comment.permalink, 'o': comment.is_submitter, 's': comment.score, 't': comment.created_utc}
        comment_queue.extend(comment.replies)
    return submission, comments_index, comments_forest
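# Illustrative shape of the returned structures, for a made-up thread "abc" with a
# single comment "def":
#   comments_index  == {'t3_abc': Node('/t3_abc'), 't1_def': Node('/t3_abc/t1_def')}
#   comments_forest == {'t1_def': {'a': 'someuser', 'b': 'comment text', 'd': None,
#                                  'e': False, 'l': '/r/sub/comments/abc/title/def/',
#                                  'o': False, 's': 1, 't': 1700000000.0}}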
def generate_html(submission, submission_id, now_str, sort, comments_index, comments_forest):
    """
    Generates HTML structure with the submission, its replies and all its info in it.
    Note: as of now, "sort" is unused. Todo?
    """
    # Beginning of file, with <head> section
    html_head = f"""<!doctype html><html><head><meta charset="utf-8"/><title>{submission.subreddit.display_name} – {submission.title}</title><style>html{{font-family: 'Arial', 'Helvetica', sans-serif;font-size: 15px;box-sizing: border-box;}}div{{margin: 0px -5px 0px 0px;padding: 5px;}}header{{font-weight: bold;}}.f{{margin-top: 15px;}}.o{{background-color: #eaeaea;}}.e{{background-color: #fafafa;}}.l1{{border-left: 4px solid #3867d6;}}.l1 > header, .l1 > a, .l1 > header a{{color: #3867d6;}}.l2{{border-left: 4px solid #e74c3c;}}.l2 > header, .l2 > a, .l2 > header a{{color: #e74c3c;}}.l3{{border-left: 4px solid #20bf6b;}}.l3 > header, .l3 > a, .l3 > header a{{color: #20bf6b;}}.l4{{border-left: 4px solid #f7b731;}}.l4 > header, .l4 > a, .l4 > header a{{color: #f7b731;}}.l5{{border-left: 4px solid #9b59b6;}}.l5 > header, .l5 > a, .l5 > header a{{color: #9b59b6;}}.l6{{border-left: 4px solid #fa8231;}}.l6 > header, .l6 > a, .l6 > header a{{color: #fa8231;}}.l7{{border-left: 4px solid #a5b1c2;}}.l7 > header, .l7 > a, .l7 > header a{{color: #a5b1c2;}}.l8{{border-left: 4px solid #4b6584;}}.l8 > header, .l8 > a, .l8 > header a{{color: #4b6584;}}.l9{{border-left: 4px solid #0fb9b1;}}.l9 > header, .l9 > a, .l9 > header a{{color: #0fb9b1;}}.l0{{border-left: 4px solid #fd79a8;}}.l0 > header, .l0 > a, .l0 > header a{{color: #fd79a8;}}.m{{background-color: #c8ffc8;}}.a{{background-color: #ffdcd2;}}.p{{background-color: #b4c8ff;}}.n{{text-decoration: none;}}.D{{cursor:not-allowed!important;color:#ccc!important;}}</style></head><body>"""
    # Header of file, with submission info
    html_submission = f"""<h1><a href="{config['reddit']['root']}/r/{submission.subreddit.display_name}/">/r/{submission.subreddit.display_name}</a> – <a href="{config['reddit']['root']}{submission.permalink}">{submission.title}</a></h1><h2>Snapshot taken on {now_str}<br/>Posts: {submission.num_comments} – Score: {submission.score} ({int(submission.upvote_ratio*100)}% upvoted) – Flair: {'None' if submission.link_flair_text is None else submission.link_flair_text} – Sorted by: {sort}<br/>Sticky: {'No' if submission.stickied is False else 'Yes'} – Spoiler: {'No' if submission.spoiler is False else 'Yes'} – NSFW: {'No' if submission.over_18 is False else 'Yes'} – OC: {'No' if submission.is_original_content is False else 'Yes'} – Locked: {'No' if submission.locked is False else 'Yes'}</h2><p><em>Snapshot taken from {__NAME__} v{__VERSION__}. All times are UTC.</em></p>"""
    # First comment (which is actually OP's post)
    html_firstpost = f"""<h3>Original post</h3><div class="b p f l1" id="t3_{submission_id}"><header><a href="{config['reddit']['root']}/u/{'(deleted)' if submission.author is None else submission.author.name}">{'(deleted)' if submission.author is None else submission.author.name}</a>, on {datetime.datetime.fromtimestamp(submission.created_utc).strftime(config["defaults"]["dateformat"])}</header>{comment_parser(submission.selftext)}</div><h3>Comments</h3>"""
    # Iterating through the tree to put comments in the right order
    html_comments = ''
    previous_comment_level = 1 # We begin at level 1.
    comment_counter = 1 # Comment counter
    for node in PreOrderIter(comments_index['t3_'+submission_id]):
        current_comment_level = node.depth
        current_comment_id = node.name
        if node.name[:2] == 't3': # root is the submission itself, we ignore it
            continue
        # We close as many comments as we need to.
        # If this is a sibling (= same level), we just close one comment.
        # If this is on another branch, we close as many comments as needed to close the branch.
        if current_comment_level <= previous_comment_level:
            for i in range(0, previous_comment_level-current_comment_level+1):
                html_comments += '</div>'
        # CSS classes to be applied.
        classes = ''
        # If first-level comment, we put a margin
        if current_comment_level == 1:
            classes += 'f '
        if comments_forest[current_comment_id]['d'] == 'admin':
            classes += 'a ' # Distinguished administrator post color
        elif comments_forest[current_comment_id]['d'] == 'moderator':
            classes += 'm ' # Distinguished moderator post color
        elif comments_forest[current_comment_id]['o']:
            classes += 'p ' # OP post color
        elif current_comment_level % 2 == 0:
            classes += 'e ' # Even post color
        else:
            classes += 'o ' # Odd post color
        # Post level
        classes += 'l'+str(current_comment_level)[-1] # only taking the last digit
        html_comments += f'<div class="{classes}" id="{current_comment_id}">'
        # Getting parents and siblings for easy navigation
        try:
            previous_sibling = anytree_util.leftsibling(node).name
            previous_sibling_d = ''
        except AttributeError: # first sibling
            previous_sibling = ''
            previous_sibling_d = ' D' # class "disabled" for first and last siblings
        try:
            next_sibling = anytree_util.rightsibling(node).name
            next_sibling_d = ''
        except AttributeError: # last sibling
            next_sibling = ''
            next_sibling_d = ' D'
        parent = node.parent.name
        time_comment = datetime.datetime.fromtimestamp(comments_forest[current_comment_id]['t'])
        time_comment_str = time_comment.strftime(config["defaults"]["dateformat"])
        # Adding the comment to the list
        html_comments += f"""<header><a href="{config['reddit']['root']}/u/{comments_forest[current_comment_id]['a']}">{comments_forest[current_comment_id]['a']}</a>, on <a href="{config['reddit']['root']}{comments_forest[current_comment_id]['l']}">{time_comment_str}</a> ({comments_forest[current_comment_id]['s']}{'' if comments_forest[current_comment_id]['e'] is False else ', edited'}) <a href="#{parent}" class="n P">▣</a> <a href="#{previous_sibling}" class="n A{previous_sibling_d}">🠉</a> <a href="#{next_sibling}" class="n B{next_sibling_d}">🠋</a> <a href="#{current_comment_id}" class="n S">◯</a></header>{comment_parser(comments_forest[current_comment_id]['b'])}"""
        previous_comment_level = current_comment_level
        comment_counter += 1
    # JS managing scrolling features
    html_js = '<script>function checkKey(e){"38"==(e=e||window.event).keyCode?(e.preventDefault(),scrollToSibling("A")):"40"==e.keyCode?(e.preventDefault(),scrollToSibling("B")):"37"!=e.keyCode&&"80"!=e.keyCode||scrollToParent()}function scrollToSibling(e){var o,t=window.location.hash.substr(1),n=document.getElementById(t).getElementsByClassName(e)[0];n.classList.contains("D")||(o=n.getAttribute("href").substr(1),document.getElementById(o).scrollIntoView(!0),window.location.hash=o)}function scrollToParent(){var e=window.location.hash.substr(1),t=document.getElementById(e).parentNode.id;document.getElementById(t).scrollIntoView(!0);window.location.hash=t}document.onkeydown=checkKey;</script>'
    # Merging this all together
    html_total = html_head+html_submission+html_firstpost+html_comments+html_js
    return html_total
def write_file(content, submission, now, output_directory):
    """
    Writes the HTML content into a file. Returns the filename
    """
    # keeping the submission name from the URL
    sanitized_name = submission.permalink.split('/')[-2]
    # Reducing the name to 150 characters so the filename stays a reasonable length
    sanitized_name = sanitized_name[:150]
    filename = f"{submission.subreddit.display_name}-{sanitized_name}-{now.strftime('%Y%m%d-%H%M%S')}.html"
    path = os.path.join(output_directory, filename)
    with open(path, "wb") as f:
        f.write(content.encode('utf-8'))
    return filename
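# Illustrative result: a submission at /r/python/comments/abc123/some_thread/ saved on
# 2024-01-01 12:00:00 UTC becomes "python-some_thread-20240101-120000.html".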
def myprint(message, color, stderr=False):
    """
    Easy wrapper for print
    """
    if stderr:
        print(f"{colored.fg(color)}{message}{colored.attr(0)}", file=sys.stderr)
    else:
        if args.quiet:
            return None
        print(f"{colored.fg(color)}{message}{colored.attr(0)}")
# -------------------------- #
#       Config loading       #
# -------------------------- #
try:
    with open(args.config, 'r') as f:
        config = yaml.safe_load(f)
except (OSError, yaml.YAMLError):
    myprint("[x] Cannot load config file. Make sure it exists and the syntax is correct.", 9, True)
    raise SystemExit(1)
# -------------------------- #
#       Main function        #
# -------------------------- #
try:
    now = datetime.datetime.now(datetime.timezone.utc)
    now_str = now.strftime(config["defaults"]["dateformat"])
    # Test connection
    try:
        reddit = connect()
    except Exception:
        myprint("[x] It looks like you are not properly authenticated.", 9, True)
        myprint("[x] Please check your credentials and retry.", 9, True)
        raise SystemExit(1)
    # Getting the list of submission IDs from arguments
    submission_id_list = []
    if args.id:
        for url in args.id:
            submission_id = extract_id(url)
            if submission_id is None:
                myprint(f'[x] The URL or ID "{url}" looks incorrect. Please check it.', 9, True)
                raise SystemExit(1)
            else:
                submission_id_list.append(submission_id)
    if args.file:
        for filename in args.file:
            with open(filename, "r") as f:
                urls_in_file = f.read().splitlines()
            for url in urls_in_file:
                submission_id = extract_id(url)
                if submission_id is None:
                    myprint(f'[x] The URL or ID "{url}" in "{filename}" looks incorrect. Please check it.', 9, True)
                    raise SystemExit(1)
                else:
                    submission_id_list.append(submission_id)
    if args.saved or args.saved_extended:
        try:
            saved_submissions = get_saved_submissions(extended=args.saved_extended, limit=args.limit)
            submission_id_list.extend(saved_submissions)
        except prawcore.exceptions.InsufficientScope:
            myprint('[x] Unable to get your list of saved submissions. That usually means that you did not grant RedditArchiver enough access to your account.', 9, True)
            myprint('[x] Please get a new refresh token, making sure to allow the following scopes: read, history, identity', 9, True)
        else:
            myprint(f'[i] {len(saved_submissions)} saved submissions found.', 14)
    if args.author:
        for author in args.author:
            try:
                posted_submissions = get_posted_submissions(author, extended=False, limit=args.limit)
            except prawcore.exceptions.NotFound:
                myprint(f'[x] User "{author if author else "<yourself>"}" was not found.', 9, True)
            else:
                myprint(f'[i] {len(posted_submissions)} submissions found for user "{author if author else "<yourself>"}".', 14)
                submission_id_list.extend(posted_submissions)
    if args.author_extended:
        for author in args.author_extended:
            try:
                posted_submissions = get_posted_submissions(author, extended=True, limit=args.limit)
            except prawcore.exceptions.NotFound:
                myprint(f'[x] User "{author if author else "<yourself>"}" was not found.', 9, True)
            else:
                myprint(f'[i] {len(posted_submissions)} submissions found for user "{author if author else "<yourself>"}".', 14)
                submission_id_list.extend(posted_submissions)
    if args.upvoted:
        try:
            upvoted_submissions = get_upvoted_submissions(limit=args.limit)
            submission_id_list.extend(upvoted_submissions)
        except prawcore.exceptions.InsufficientScope:
            myprint('[x] Unable to get your list of upvoted submissions. That usually means that you did not grant RedditArchiver enough access to your account.', 9, True)
            myprint('[x] Please get a new refresh token, making sure to allow the following scopes: read, history, identity', 9, True)
        else:
            myprint(f'[i] {len(upvoted_submissions)} upvoted submissions found.', 14)
    # Downloading each submission
    submission_id_list = list(dict.fromkeys(submission_id_list)) # removing duplicates while keeping order
    if len(submission_id_list) == 0:
        myprint('[=] Nothing to download.', 10)
        raise SystemExit(0)
    else:
        myprint(f'[i] {len(submission_id_list)} submissions to download', 14)
    for submission_id in submission_id_list:
        try:
            # "Connecting" to the submission and getting information
            submission, nb_replies = get_submission(reddit, submission_id)
            myprint(f'[+] Submission {submission_id} found ("{submission.title}" on r/{submission.subreddit.display_name}, {nb_replies} replies), beginning download', 8)
            # Getting the comment index and comment forest
            submission, comments_index, comments_forest = download_submission(submission, submission_id)
        except prawcore.exceptions.NotFound:
            myprint(f"[x] The submission {submission_id} was not found.", 9, True)
            continue
        except prawcore.exceptions.Forbidden:
            myprint(f"[x] Not allowed to access the submission {submission_id}. That usually means the submission is on a private subreddit you do not have access to.", 9, True)
            continue
        else:
            myprint("[+] Submission downloaded.", 8)
        # Generating the HTML structure
        while True: # allows retrying with a higher recursion limit
            try:
                html = generate_html(submission, submission_id, now_str, None, comments_index, comments_forest)
            except RecursionError:
                if args.disable_recursion_limit:
                    sys.setrecursionlimit(sys.getrecursionlimit()*2)
                else:
                    myprint("[x] The HTML structure could not be generated because the structure of the replies goes too deep for the program to handle.", 9, True)
                    myprint("[x] If you really want to save that thread, pass the --disable-recursion-limit option. Please note however that this might crash the program.", 9, True)
                    raise SystemExit(1)
            else:
                break
        myprint("[+] Submission structured.", 8)
        # Saving to disk
        try:
            filename = write_file(html, submission, now, args.output)
        except PermissionError:
            myprint("[x] Could not write file because of bad permissions.", 9, True)
            raise SystemExit(1)
        except Exception as e:
            myprint(f"[x] Uncaught problem when writing the file: {e}", 9, True)
            raise SystemExit(1)
        myprint(f"[=] Submission saved! Filename: {filename}", 10)
except Exception as e:
    # general catch
    myprint(f"[x] Uncaught problem: {e}", 9, True)
    raise