-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathytarchiver.py
executable file
·443 lines (388 loc) · 17.1 KB
/
ytarchiver.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
#!/usr/bin/env python3
''' ytarchiver - download and archive youtube videos or playlists '''
import os
import sys
import re
import argparse
import time
import sqlite3
import json
import random
import yt_dlp
from yt_dlp.utils import read_batch_urls as readBatchURLs
from yt_dlp.utils import match_filter_func as matchFilterFunc
from requests.exceptions import RequestException
import ytacommon as yta
from ytapost import PostHook
import ytainfo
import ytameta
# --------------------------------------------------------------------------- #
def _buildParser():
    '''Create the command line parser used by the archive function

    :returns: The configured argument parser
    :rtype: argparse.ArgumentParser
    '''
    parser = argparse.ArgumentParser(prog="ytarchiver", description="Download and archive Youtube videos or playlists")
    parser.add_argument("-a", "--all", action="store_const", dest="all", const=True, default=False, help="Run archiver for all subdirectories with archive databases. In this mode, LANG and VIDEO will always be read from the databases")
    parser.add_argument("-c", "--check", action="store_const", dest="check", const="-c", default="", help="Check each file after download")
    #Statistics options exclude each other
    group = parser.add_mutually_exclusive_group()
    group.add_argument("-s", "--statistics", action="store_const", dest="statistics", const=True, default=False, help="Update the video statistics")
    group.add_argument("-u", "--captions", action="store_const", dest="captions", const=True, default=False, help="List videos where captions were added since archiving (forces -s)")
    group.add_argument("-x", "--amendcaptions", action="store_const", dest="amendcaptions", const=True, default=False, help="Download captions were they were added since archiving (forces -u and consequently -s)")
    parser.add_argument("-r", "--replace", action="store_const", dest="replace", const="-r", default="", help="Replace an existing video (a video ID has to be provided)")
    #Resolution options exclude each other
    group = parser.add_mutually_exclusive_group()
    group.add_argument("-8k", "--8K", action="store_const", dest="quality", const="8k", help="Limit download resolution to 8K")
    group.add_argument("-4k", "--4K", action="store_const", dest="quality", const="4k", help="Limit download resolution to 4K (default)")
    group.add_argument("-hd", "--HD", action="store_const", dest="quality", const="hd", help="Limit download resolution to full HD")
    parser.add_argument("-V", "--version", action="version", version='%(prog)s {}'.format(yta.__version__))
    parser.add_argument("DIR", help="The directory to work in")
    parser.add_argument("LANG", nargs='?', help="The video language (read from the database if not given)")
    parser.add_argument("-f", "--file", action="store", dest="file", help="Read IDs to archive from a batch file with one ID per line")
    #NOTE(review): --filter is registered on the resolution group and therefore cannot be combined with -8k/-4k/-hd - confirm this is intended
    group.add_argument("--filter", action="store", dest="filter", default=None, help="Filter videos to download using Youtube-dl's match filter option")
    parser.add_argument("VIDEO", nargs='?', help="The Youtube video or playlist ID (read from the database if not given)")
    return parser
# ########################################################################### #

# --------------------------------------------------------------------------- #
def archive(args, parsed=False):
    '''Archive youtube videos or playlists

    Invalid input (no existing directory, missing LANG/VIDEO) is reported
    via the argument parser, which prints the usage and exits.

    :param args: The command line arguments given by the user, or the already parsed arguments if ``parsed`` is set
    :type args: list or argparse.Namespace
    :param parsed: Whether ``args`` was already parsed (default: False)
    :type parsed: boolean
    '''
    #The parser is always created as it is also used for reporting errors
    #when the arguments were parsed by the caller (e.g. by archiveAll)
    parser = _buildParser()

    #Parse arguments
    if not parsed:
        args = parser.parse_args(args)
        if args.all and args.file:
            parser.error("-a cannot be used in combination with batch file")
        if args.all and args.replace:
            parser.error("-a cannot be used in combination with replace")

    #Check if API key provided
    yta.getAPIKey(True)

    #Archive all subdirectories
    if args.all:
        archiveAll(args)
        return

    #Validate path
    path = os.path.normpath(os.path.abspath(args.DIR))
    if not os.path.isdir(path):
        parser.error("An existing directory must be specified")

    #Check if database exists
    dbPath = os.path.join(path, "archive.db")
    if not os.path.isfile(dbPath):
        #No database found, ask to create one (empty answer means yes)
        while True:
            answer = input("New archive. Populate with channel info? [Y|n] ")
            if not answer:
                answer = 'y'
            choice = answer[0].lower()
            if choice in ['y', 'n']:
                break
        if choice == 'y':
            ytainfo.add(dbPath)
        else:
            ytainfo.createEmpty(dbPath)

    t1 = time.time()

    #Check if database needs upgrade
    yta.upgradeDatabase(dbPath)

    #Check if ID and language are specified
    if not args.LANG or (not args.VIDEO and not args.file):
        #Try reading playlist and language from database
        try:
            (args.LANG, args.VIDEO) = readInfoFromDB(dbPath)
        except (sqlite3.Error, TypeError):
            #Try reading playlist and language from files
            try:
                with open(os.path.join(path, "language"), 'r') as f:
                    args.LANG = f.readline().strip()
                with open(os.path.join(path, "playlist"), 'r') as f:
                    args.VIDEO = f.readline().strip()
            except (IndexError, OSError):
                parser.error("LANG and VIDEO must be specified if no database exists.")

    #Update lastupdate field
    updateTimestamp = int(time.time())
    db = yta.connectDB(dbPath)
    db.execute("UPDATE channel SET lastupdate = ? WHERE id = 1", (updateTimestamp, ))

    #Replace existing video, which is only possible if it is already in the database
    if args.replace:
        try:
            row = db.execute("SELECT youtubeID FROM videos WHERE youtubeID = ?;", (args.VIDEO,)).fetchone()
        except sqlite3.Error:
            row = None
        if not row or not row[0]:
            print("ERROR: Unable to replace video with ID \"{}\" as it is not in the archive database".format(args.VIDEO))
            return

    #Get format string, the command line overrides the resolution stored in the database
    if args.quality:
        resolution = args.quality
    else:
        resolution = db.execute("SELECT maxresolution FROM channel WHERE id=1;").fetchone()[0]
    dlformat = yta.getFormatString(resolution)

    #Prepare download
    dlfilePath = os.path.join(path, "downloaded")
    writeDownloadedFile(dbPath, dlfilePath, args.replace, args.VIDEO)
    dlpath = os.path.join(path, "ID%(id)s&%(title)s.%(ext)s")
    postHook = PostHook(args.LANG, db, args.check, args.replace)

    #Set options
    ytdlOpts = {"call_home": False, "quiet": False, "format": dlformat, "ignoreerrors": True, "download_archive": dlfilePath, "writesubtitles": True, "subtitleslangs": [args.LANG], "writedescription": True, "writethumbnail": True, "outtmpl": dlpath, "cachedir": False, "youtube_include_dash_manifest": True, "retries": 10, "fragment_retries": 25, "skip_unavailable_fragments": False, "continuedl": True, "extractor_args": {"youtube": {"player_client": ["android"]}}, "throttledratelimit": 100000, "allow_playlist_files": False, "post_hooks": [postHook.finished]}
    ytdlOpts["postprocessors"] = [{"key": "FFmpegVideoConvertor", "preferedformat": "mp4"}, {"key": "FFmpegMetadata"}, {"key": "EmbedThumbnail","already_have_thumbnail": False}]
    if args.filter:
        ytdlOpts["match_filter"] = matchFilterFunc(args.filter)

    #Check if archiving one video/playlist or using a batch file
    if args.file:
        with open(args.file, 'r', encoding="utf-8") as f:
            url = readBatchURLs(f)
    else:
        url = [args.VIDEO]

    #Prepare log
    logFile = os.path.join(path, "log")

    #Download, the output is shown on the terminal and written to the log file
    with DoubleLogger(logFile):
        with yt_dlp.YoutubeDL(ytdlOpts) as ytdl:
            ytdl.download(url)

    #Print status
    print("Download complete, updating database...")

    #Update video number and totalsize (best effort, database errors are ignored)
    try:
        db.execute("UPDATE channel SET videos = (SELECT count(id) FROM videos), totalsize = (SELECT sum(filesize) FROM videos) WHERE id = 1;")
    except sqlite3.Error:
        pass

    #Update statistics
    if args.statistics or args.captions or args.amendcaptions:
        print("Updating video statistics...")
        try:
            ytameta.updateStatistics(db, updateTimestamp, args.captions, amendCaptions=args.amendcaptions)
        except yta.NoAPIKeyError:
            print("ERROR: Unable to update video statistics as no API key is available")
        except RequestException as e:
            print("ERROR: Unable to update video statistics due to connection error: \"{}\"".format(e))

    #Close database
    yta.closeDB(db)

    #Print time
    t2 = time.time()
    print("DONE! Duration: " + yta.intervalToStr(t2-t1))

    #Remove download archive file, it is rewritten from the database on the next run
    try:
        os.remove(dlfilePath)
    except OSError:
        pass
# ########################################################################### #
# --------------------------------------------------------------------------- #
def archiveAll(args):
    '''Call archive script for all subdirs

    The subdirectories of ``args.DIR`` containing an archive database are
    archived one after another in random order. If the run is aborted with
    a keyboard interrupt, the remaining subdirectories are written to a
    progress file which is picked up again if the next run starts less
    than an hour (3600 seconds) after the abort.

    :param args: The parsed command line arguments given by the user
    :type args: argparse.Namespace
    '''
    #Set all to false for subsequent calls
    args.all = False

    #Set statistics to false for subsequent calls, they are updated once at the end
    updateStatistics = args.statistics
    updateCaptions = args.captions
    amendCaptions = args.amendcaptions
    args.statistics = False
    args.captions = False
    args.amendcaptions = False

    t1 = time.time()

    #Get path
    path = os.path.normpath(os.path.abspath(args.DIR))

    #Check for progress file in directory
    progressPath = os.path.join(path, "progress.json")
    try:
        with open(progressPath, 'r') as f:
            progress = json.load(f)
        if (t1 - progress["abortTime"]) > 3600:
            #Progress file is too old, start from scratch
            progress = {"elapsed": 0}
            os.remove(progressPath)
    except (OSError, ValueError, KeyError, TypeError):
        #Progress file is missing, unreadable or malformed, start from scratch
        progress = {"elapsed": 0}

    logFile = os.path.join(path, "log")

    if "subdirs" in progress:
        #Continue the aborted run
        subdirs = progress["subdirs"]
        #The aborted channel is archived again, so it must not be counted twice
        counter = progress["counter"] - 1
        channels = progress["channels"]
        #Print message
        if channels > 1:
            print("CONTINUING ARCHIVING ALL {} CHANNELS IN \'{}\'\n".format(channels, path))
    else:
        #Get subdirs in path
        subdirs = [os.path.join(path, name) for name in sorted(os.listdir(path), key=str.casefold) if os.path.isdir(os.path.join(path, name))]
        subdirs = [sub for sub in subdirs if os.path.isfile(os.path.join(sub, "archive.db"))]
        random.shuffle(subdirs)
        if not subdirs:
            print("ERROR: No subdirs with archive databases at \'{}\'".format(path))
            return
        #Prepare
        channels = len(subdirs)
        progress["channels"] = channels
        counter = 0
        #Start with an empty log file
        with open(logFile, 'w') as f:
            f.truncate()
        #Print message
        if channels > 1:
            print("ARCHIVING ALL {} CHANNELS IN \'{}\'\n".format(channels, path))

    #Initiate error log
    errorLog = ""

    #Loop through all subdirs
    t2 = time.time()
    leftover = subdirs.copy()
    try:
        for subdir in subdirs:
            counter += 1
            name = os.path.basename(os.path.normpath(subdir))
            args.DIR = subdir
            args.LANG = None
            args.VIDEO = None
            print("\nARCHIVING \'{}\' ({}/{})".format(name, counter, channels))
            archive(args, True)
            #Read errors from log, each error is reported together with the line preceding it
            error = ""
            with open(os.path.join(subdir, "log"), 'r') as f:
                lines = f.readlines()
            previous = ""
            for line in lines:
                if line.startswith("ERROR"):
                    #The first line of the log has no preceding line
                    error += "\n" + previous + line
                previous = line
            if error:
                errorLog += '\n\n' + name + '\n' + error
            leftover.remove(subdir)
    except KeyboardInterrupt:
        #Aborting, write progress file and log
        t = time.time()
        progress["counter"] = counter
        progress["subdirs"] = leftover
        progress["elapsed"] += t - t2
        progress["abortTime"] = t
        with open(progressPath, 'w') as f:
            json.dump(progress, f)
        if errorLog:
            with open(logFile, 'a') as f:
                f.write(errorLog)
        #Rethrow exception
        raise

    #Progress file no longer relevant, removing it
    try:
        os.remove(progressPath)
    except OSError:
        pass

    #Write error log
    if not errorLog:
        errorLog = "No errors\n"
    with open(logFile, 'a') as f:
        f.write(errorLog)

    t3 = time.time()

    #Check if statistics is set to autoupdate
    autoUpdateStatistics = False
    if not updateStatistics or updateCaptions:
        statsDB = None
        try:
            statsDB = yta.connectDB(os.path.join(path, "statistics.db"))
            row = statsDB.execute("SELECT autoupdate FROM setup ORDER BY id DESC LIMIT 1;").fetchone()
            #No row means no setup, leave autoupdate disabled
            if row:
                autoUpdateStatistics = bool(row[0])
        except sqlite3.Error:
            pass
        finally:
            #Only close the database if the connection was established
            if statsDB is not None:
                try:
                    yta.closeDB(statsDB)
                except sqlite3.Error:
                    pass

    #Update statistics
    if updateStatistics or autoUpdateStatistics or updateCaptions or amendCaptions:
        statTime = True
        try:
            ytameta.updateAllStatistics(path, autoUpdateStatistics, updateCaptions, amendCaptions)
        except yta.NoAPIKeyError:
            print("ERROR: Unable to update video statistics as no API key is available")
        except RequestException as e:
            print("ERROR: Unable to update video statistics due to connection error: \"{}\"".format(e))
    else:
        statTime = False

    #Print time, including the time spent in previous aborted runs
    t4 = time.time()
    print("\nTotal runtime: {}\nArchive runtime: {}".format(yta.intervalToStr(progress["elapsed"]+(t4-t1)), yta.intervalToStr(progress["elapsed"]+(t3-t2))))
    if statTime:
        print("Statistic runtime: " + yta.intervalToStr(t4-t3))
    print("\nDONE!")
# ########################################################################### #
# --------------------------------------------------------------------------- #
def writeDownloadedFile(dbPath, filePath, replace, videoID):
    '''Write file containing Youtube IDs of all videos already archived

    One line of the form ``youtube <ID>`` is written per video. Nothing is
    written if the database does not exist. Database errors are swallowed;
    the file may then be left empty or incomplete.

    :param dbPath: Path of the archive database
    :type dbPath: string
    :param filePath: Path where the file containing all existing IDs should be written to
    :type filePath: string
    :param replace: Whether to replace the existing video in the archive database (any truthy value), in which case its ID is left out of the file
    :type replace: boolean
    :param videoID: The new video id
    :type videoID: string
    '''
    #Check if db exists
    if not os.path.isfile(dbPath):
        return

    try:
        with open(filePath, 'w+') as f:
            #Connect to database
            db = yta.connectDB(dbPath)
            try:
                #Read IDs of all videos already in archive
                r = db.execute("SELECT youtubeID FROM videos;")
                for item in r.fetchall():
                    #Write IDs to file, skipping the video to replace
                    if not (replace and videoID == item[0]):
                        f.write("youtube {}\n".format(item[0]))
            finally:
                #Close the database even if reading failed
                yta.closeDB(db)
    except sqlite3.Error:
        return
# ########################################################################### #
# --------------------------------------------------------------------------- #
def readInfoFromDB(dbPath):
    '''Read playlist and language from database

    The values are taken from the channel entry with the highest id.

    :param dbPath: Path of the archive database
    :type dbPath: string

    :raises: :class:`sqlite3.Error`: Unable to read from database
    :raises: :class:`TypeError`: The database contains no channel entry

    :returns: List with language code at index 0 and playlist at index 1
    :rtype: list of string
    '''
    db = yta.connectDB(dbPath)
    try:
        item = db.execute("SELECT language,playlist FROM channel ORDER BY id DESC LIMIT 1;").fetchone()
    finally:
        #Close the database even if the query failed
        yta.closeDB(db)
    #Subscripting raises TypeError if no entry was found (item is None)
    return [item[0], item[1]]
# ########################################################################### #
# --------------------------------------------------------------------------- #
class DoubleLogger:
    '''Double logger context

    Get messages to stdout and stderr, print them to stdout and write them to a log

    While the context is active, sys.stdout and sys.stderr are replaced by
    this object. Terminal escape sequences are removed from the messages
    before they are written to the log.
    '''
    def __init__(self, log):
        '''Init

        The log file is opened (and truncated) immediately.

        :param log: Location of the new log file
        :type log: path-like
        '''
        self.oldout = sys.stdout
        self.olderr = sys.stderr
        self.term = sys.stdout
        self.log = open(log, 'w+')
        #Matches terminal escape sequences, which should not end up in the log
        self.filter = re.compile(r'\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])')

    def __enter__(self):
        '''Enter context, start logging to stdout and log file

        :returns: The logger itself
        :rtype: DoubleLogger
        '''
        sys.stdout = self
        sys.stderr = self
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        '''Exit context, restore stdout and stderr and close the log file'''
        sys.stdout = self.oldout
        sys.stderr = self.olderr
        self.close()

    def write(self, msg):
        '''Write a message to stdout and the log file

        :param msg: The message
        :type msg: string

        :returns: The number of characters of the message
        :rtype: integer
        '''
        self.term.write(msg)
        self.log.write(self.filter.sub('', msg))
        #Text streams are expected to report the number of characters written
        return len(msg)

    def flush(self):
        '''Flush stdout and the log file'''
        #Flush the terminal as well so that unterminated lines become visible
        self.term.flush()
        self.log.flush()

    def close(self):
        '''Close the log file'''
        self.log.close()

    @staticmethod
    def isatty():
        '''Answer True to "Is a TTY?" question'''
        return True
# ########################################################################### #
# --------------------------------------------------------------------------- #
if __name__ == "__main__":
    #Hand over the command line arguments without the script name
    cliArgs = sys.argv[1:]
    try:
        archive(cliArgs)
    except KeyboardInterrupt:
        #Aborted by the user, print a short message instead of a traceback
        print("Aborted!")
# ########################################################################### #