#!/usr/bin/env python
#-------------------------------------------------------------------------------
# Name:        Derpibooru-dl
# Purpose:     Download submissions and metadata from derpibooru.org
#
# Author:      woodenphone
#
# Created:     2014-02-88
# Copyright:   (c) new 2014
# Licence:     <your licence>
#-------------------------------------------------------------------------------
import time
import os
import sys
import re
import mechanize
import cookielib
import logging
import urllib2
import httplib
import random
import glob
import ConfigParser
import HTMLParser
import json
import shutil
import pickle
import socket
import hashlib
import string
import argparse
import derpibooru
# getwithinfo()
GET_REQUEST_DELAY = 0
GET_RETRY_DELAY = 30 # [19:50] <@CloverTheClever> Ctrl-S: if your downloader gets a connection error, sleep 10 and increase delay between attempts by a second
GET_MAX_ATTEMPTS = 10
def setup_logging(log_file_path):
    # Set up logging (before running any other code)
    # http://inventwithpython.com/blog/2012/04/06/stop-using-print-for-debugging-a-5-minute-quickstart-guide-to-pythons-logging-module/
    assert( len(log_file_path) > 1 )
    assert( type(log_file_path) == type("") )
    global logger
    # Make sure the output dir exists
    log_file_folder = os.path.dirname(log_file_path)
    if log_file_folder:# os.path.dirname() returns "" when the path has no folder component
        if not os.path.exists(log_file_folder):
            os.makedirs(log_file_folder)
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    # Log to file
    fh = logging.FileHandler(log_file_path)
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    # Log to console as well
    ch = logging.StreamHandler()
    ch.setLevel(logging.DEBUG)
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    logging.debug("Logging started.")
    return
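# Illustrative usage sketch for setup_logging(); the log path below is an
# assumption for demonstration, not a path the rest of this script relies on.
def _example_setup_logging():
    setup_logging(os.path.join("debug", "example_log.txt"))
    logging.info("Messages now go to both the console and the log file.")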
def add_http(url):
    """Ensure a url starts with http:// or https://"""
    if "http://" in url:
        return url
    elif "https://" in url:
        return url
    else:
        # Case: protocol-relative links, e.g. //derpicdn.net/img/view/...
        first_two_chars = url[0:2]
        if first_two_chars == "//":
            output_url = "https:"+url
            return output_url
        else:
            logging.error(repr(locals()))
            raise ValueError
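# Illustrative sketch of what add_http() does with the protocol-relative URLs
# found in Derpibooru metadata; the image path below is made up for demonstration.
def _example_add_http():
    assert add_http("//derpicdn.net/img/view/example.png") == "https://derpicdn.net/img/view/example.png"
    assert add_http("https://derpibooru.org/1.json") == "https://derpibooru.org/1.json"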
def deescape(html):
    # De-escape HTML entities
    # http://stackoverflow.com/questions/2360598/how-do-i-unescape-html-entities-in-a-string-in-python-3-1
    deescaped_string = HTMLParser.HTMLParser().unescape(html)
    return deescaped_string
def get(url):
    """Try to retrieve a url. If unable to, return a None object.
    Example usage:
    html = get("")
    if html:
    """
    assert_is_string(url)
    deescaped_url = deescape(url)
    url_with_protocol = add_http(deescaped_url)
    #logging.debug( "getting url ", locals())
    gettuple = getwithinfo(url_with_protocol)
    if gettuple:
        reply, info = gettuple
        return reply
    else:
        return
def getwithinfo(url):
    """Try to retrieve a url. If unable to, return None objects.
    Example usage:
    html = get("")
    if html:
    """
    attemptcount = 0
    while attemptcount < GET_MAX_ATTEMPTS:
        attemptcount = attemptcount + 1
        if attemptcount > 1:
            delay(GET_RETRY_DELAY)
            logging.debug( "Attempt "+repr(attemptcount)+" for URL: "+repr(url) )
        try:
            save_file(os.path.join("debug","get_last_url.txt"), url, True)
            r = br.open(url, timeout=100)# br is the global mechanize browser set up by setup_browser()
            info = r.info()
            reply = r.read()
            delay(GET_REQUEST_DELAY)
            # Save html responses for debugging
            #print info
            #print info["content-type"]
            if "html" in info["content-type"]:
                #print "saving debug html"
                save_file(os.path.join("debug","get_last_html.htm"), reply, True)
            else:
                save_file(os.path.join("debug","get_last_not_html.txt"), reply, True)
            # Retry if empty response and not last attempt
            if (len(reply) < 1) and (attemptcount < GET_MAX_ATTEMPTS):
                logging.error("Reply too short :"+repr(reply))
                continue
            return reply,info
        except urllib2.HTTPError, err:
            logging.debug(repr(err))
            if err.code == 404:
                logging.debug("404 error! "+repr(url))
                return
            elif err.code == 403:
                logging.debug("403 error, ACCESS DENIED! url: "+repr(url))
                return
            elif err.code == 410:
                logging.debug("410 error, GONE")
                return
            else:
                save_file(os.path.join("debug","HTTPError.htm"), err.fp.read(), True)
                continue
        except urllib2.URLError, err:
            logging.debug(repr(err))
            if "unknown url type:" in err.reason:
                return
            else:
                continue
        except httplib.BadStatusLine, err:
            logging.debug(repr(err))
            continue
        except httplib.IncompleteRead, err:
            logging.debug(repr(err))
            continue
        except mechanize.BrowserStateError, err:
            logging.debug(repr(err))
            continue
        except socket.timeout, err:
            logging.debug(repr( type(err) ) )
            logging.debug(repr(err))
            continue
    logging.critical("Too many repeated fails, exiting.")
    sys.exit()# [19:51] <@CloverTheClever> if it does it more than 10 times, quit/throw an exception upstream
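# Illustrative sketch of the getwithinfo() calling convention: it returns a
# (body, headers) tuple on success and None after an unrecoverable error, so
# callers unpack it only after checking it. Assumes setup_browser() has already
# run; the URL here is an assumed example of the site's JSON endpoint.
def _example_getwithinfo():
    gettuple = getwithinfo("https://derpibooru.org/1.json")
    if gettuple:
        reply, info = gettuple
        logging.debug("Got "+repr(len(reply))+" bytes of "+repr(info["content-type"]))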
def save_file(filenamein,data,force_save=False):
    if not force_save:
        if os.path.exists(filenamein):
            logging.debug("file already exists! "+repr(filenamein))
            return
    sanitizedpath = filenamein# sanitizepath(filenamein)
    foldername = os.path.dirname(sanitizedpath)
    if len(foldername) >= 1:
        if not os.path.isdir(foldername):
            os.makedirs(foldername)
    output_file = open(sanitizedpath, "wb")# Renamed from "file" to avoid shadowing the builtin
    output_file.write(data)
    output_file.close()
    return
def delay(basetime,upperrandom=0):
    # Replacement for time.sleep(); adds a random extra delay to be sneaky
    sleeptime = basetime + random.randint(0,upperrandom)
    #logging.debug("pausing for "+repr(sleeptime)+" ...")
    time.sleep(sleeptime)
def crossplatform_path_sanitize(path_to_sanitize,remove_repeats=False):
    """Take a desired file path and chop away at it until it fits all platforms' path requirements.
    NOTE: Unfinished; the checks below are stubs and no sanitized path is returned yet."""
    # Remove disallowed characters
    # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247%28v=vs.85%29.aspx
    windows_bad_chars = """/\\"""
    nix_bad_chars = """/"""
    all_bad_chars = set(windows_bad_chars) | set(nix_bad_chars)# Set union; sets do not support "+"
    if remove_repeats:
        # Remove repeated characters, such as hyphens or spaces
        pass
    # Shorten if above filepath length limits
    windows_max_filepath_length = 255
    nix_max_filepath_length = None
    # Ensure first and last characters of path segments are not whitespace
    path_segments = []
def import_list(listfilename="ERROR.txt"):
    """Read in a text file, return each line as a string in a list"""
    if os.path.exists(listfilename):# Check if there is a list
        query_list = []# Make an empty list
        list_file = open(listfilename, "rU")
        for line in list_file:
            if line[0] != "#" and line[0] != "\n":# Skip lines starting with '#' and blank lines
                if line[-1] == "\n":# Remove the trailing newline if it exists
                    stripped_line = line[:-1]
                else:
                    stripped_line = line# If no trailing newline exists, we don't need to strip it
                query_list.append(stripped_line)# Add the query to the list
        list_file.close()
        return query_list
    else: # If there is no list, make one
        new_file_text = ("# Add one query per line, Full derpibooru search syntax MAY be available. Enter queries exactly as you would on the site.\n"
                         + "# Any line that starts with a hash symbol (#) will be ignored.\n"
                         + "# Search syntax help is available at https://derpibooru.org/search/syntax \n"
                         + "# Example 1: -(pinkamena, +grimdark)\n"
                         + "# Example 2: reversalis")
        list_file = open(listfilename, "w")
        list_file.write(new_file_text)
        list_file.close()
        return []
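# Illustrative sketch of the query-list round trip: the first call writes a
# commented template file, and later calls return the uncommented lines as
# queries. The filename below is an assumption for demonstration only.
def _example_import_list():
    queries = import_list(listfilename=os.path.join("config", "example_tag_list.txt"))
    for query in queries:
        logging.info("Would search for: "+repr(query))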
def append_list(lines,list_file_path="done_list.txt",initial_text="# List of completed items.\n",overwrite=False):
    # Append a string or list of strings to a file; if no file exists, create it and append to the new file.
    # Strings will be separated by newlines.
    # Make sure we're saving a list of strings.
    if ((type(lines) is type("")) or (type(lines) is type(u""))):
        lines = [lines]
    # Ensure the file exists and erase it if needed
    if (not os.path.exists(list_file_path)) or (overwrite is True):
        list_file_segments = os.path.split(list_file_path)
        list_dir = list_file_segments[0]
        if list_dir:
            if not os.path.exists(list_dir):
                os.makedirs(list_dir)
        nf = open(list_file_path, "w")
        nf.write(initial_text)
        nf.close()
    # Write data to the file.
    f = open(list_file_path, "a")
    for line in lines:
        outputline = line+"\n"
        f.write(outputline)
    f.close()
    return
class config_handler():
    def __init__(self,settings_path):
        self.settings_path = settings_path
        # Make sure the settings folder exists
        settings_folder = os.path.dirname(self.settings_path)
        if settings_folder:# os.path.dirname() returns "" when there is no folder component
            if not os.path.exists(settings_folder):
                os.makedirs(settings_folder)
        # Set up settings; these are static
        self.set_defaults()
        self.load_file(self.settings_path)
        self.save_settings(self.settings_path)
        self.handle_command_line_arguments()
        # Set up things that can change during program use
        self.load_deleted_submission_list()# List of submissions that are known to have been deleted
        return
    def set_defaults(self):
        """Set the defaults for settings; these will be overridden by settings from a file"""
        # derpibooru_dl.py
        # Login
        self.api_key = "Replace_this_with_your_API_key"
        # Download Settings
        self.reverse = False
        self.output_folder = "download"# Root path to download to
        self.download_submission_ids_list = True
        self.download_query_list = True
        self.output_long_filenames = False # Should we use the derpibooru-supplied filename with the tags? !UNSUPPORTED!
        self.input_list_path = os.path.join("config","derpibooru_dl_tag_list.txt")
        self.done_list_path = os.path.join("config","derpibooru_done_list.txt")
        self.failed_list_path = os.path.join("config","derpibooru_failed_list.txt")
        self.save_to_query_folder = True # Should we save to multiple folders?
        self.skip_downloads = False # Don't retrieve remote submission files after searching
        self.sequentially_download_everything = False # Download submission 1,2,3...
        self.go_backwards_when_using_sequentially_download_everything = False # When downloading everything in range mode, should we go 10,9,8,7...?
        self.download_last_week = False # Download (approximately) the last week's submissions
        self.skip_glob_duplicate_check = False # Skip the glob.glob-based duplicate check (only check if the output file exists instead of scanning all output paths)
        self.skip_known_deleted = True # Skip submissions on the list of known deleted IDs
        self.deleted_submissions_list_path = os.path.join("config","deleted_submissions.txt")
        self.move_on_fail_verification = False # Should files be moved if verification of a submission fails?
        self.save_comments = False # Should comments be saved? Uses more resources.
        # General settings
        self.show_menu = True # Should the text based menu system be used?
        self.hold_window_open = True # Should the window be kept open after all tasks are done?
        # Internal variables, these are set through this code only
        self.resume_file_path = os.path.join("config","resume.pkl")
        self.pointer_file_path = os.path.join("config","dl_everything_pointer.pkl")
        self.filename_prefix = "derpi_"
        self.sft_max_attempts = 10 # Maximum retries in search_for_tag()
        self.max_search_page_retries = 10 # Maximum retries for a search page
        self.combined_download_folder_name = "combined_downloads"# Name of subfolder to use when saving to only one folder
        self.max_download_attempts = 10 # Number of times to retry a download before skipping
        self.verification_fail_output_path = "failed_verification"
        return
    def load_file(self,settings_path):
        """Load settings from a file"""
        config = ConfigParser.RawConfigParser()
        if not os.path.exists(settings_path):
            return
        config.read(settings_path)
        # derpibooru_dl.py
        # Login
        try:
            self.api_key = config.get("Login", "api_key")
        except ConfigParser.NoOptionError:
            pass
        # Download Settings
        try:
            self.reverse = config.getboolean("Download", "reverse")
        except ConfigParser.NoOptionError:
            pass
        try:
            self.output_folder = config.get("Download", "output_folder")
        except ConfigParser.NoOptionError:
            pass
        try:
            self.download_submission_ids_list = config.getboolean("Download", "download_submission_ids_list")
        except ConfigParser.NoOptionError:
            pass
        try:
            self.download_query_list = config.getboolean("Download", "download_query_list")
        except ConfigParser.NoOptionError:
            pass
        try:
            self.output_long_filenames = config.getboolean("Download", "output_long_filenames")
        except ConfigParser.NoOptionError:
            pass
        try:
            self.input_list_path = config.get("Download", "input_list_path")
        except ConfigParser.NoOptionError:
            pass
        try:
            self.done_list_path = config.get("Download", "done_list_path")
        except ConfigParser.NoOptionError:
            pass
        try:
            self.failed_list_path = config.get("Download", "failed_list_path")
        except ConfigParser.NoOptionError:
            pass
        try:
            self.save_to_query_folder = config.getboolean("Download", "save_to_query_folder")
        except ConfigParser.NoOptionError:
            pass
        try:
            self.skip_downloads = config.getboolean("Download", "skip_downloads")
        except ConfigParser.NoOptionError:
            pass
        try:
            self.sequentially_download_everything = config.getboolean("Download", "sequentially_download_everything")
        except ConfigParser.NoOptionError:
            pass
        try:
            self.go_backwards_when_using_sequentially_download_everything = config.getboolean("Download", "go_backwards_when_using_sequentially_download_everything")
        except ConfigParser.NoOptionError:
            pass
        try:
            self.download_last_week = config.getboolean("Download", "download_last_week")
        except ConfigParser.NoOptionError:
            pass
        try:
            self.skip_glob_duplicate_check = config.getboolean("Download", "skip_glob_duplicate_check")
        except ConfigParser.NoOptionError:
            pass
        try:
            self.skip_known_deleted = config.getboolean("Download", "skip_known_deleted")
        except ConfigParser.NoOptionError:
            pass
        try:
            self.deleted_submissions_list_path = config.get("Download", "deleted_submissions_list_path")
        except ConfigParser.NoOptionError:
            pass
        try:
            self.move_on_fail_verification = config.getboolean("Download", "move_on_fail_verification")
        except ConfigParser.NoOptionError:
            pass
        try:
            self.save_comments = config.getboolean("Download", "save_comments")
        except ConfigParser.NoOptionError:
            pass
        # General settings
        try:
            self.show_menu = config.getboolean("General", "show_menu")
        except ConfigParser.NoOptionError:
            pass
        try:
            self.hold_window_open = config.getboolean("General", "hold_window_open")
        except ConfigParser.NoOptionError:
            pass
        return
    def save_settings(self,settings_path):
        """Save settings to a file"""
        config = ConfigParser.RawConfigParser()
        config.add_section("Login")
        config.set("Login", "api_key", self.api_key)
        config.add_section("Download")
        config.set("Download", "reverse", str(self.reverse))
        config.set("Download", "output_folder", self.output_folder)
        config.set("Download", "download_submission_ids_list", str(self.download_submission_ids_list))
        config.set("Download", "download_query_list", str(self.download_query_list))
        config.set("Download", "output_long_filenames", str(self.output_long_filenames))
        config.set("Download", "input_list_path", self.input_list_path)
        config.set("Download", "done_list_path", self.done_list_path)
        config.set("Download", "failed_list_path", self.failed_list_path)
        config.set("Download", "save_to_query_folder", str(self.save_to_query_folder))
        config.set("Download", "skip_downloads", str(self.skip_downloads))
        config.set("Download", "sequentially_download_everything", str(self.sequentially_download_everything))
        config.set("Download", "go_backwards_when_using_sequentially_download_everything", str(self.go_backwards_when_using_sequentially_download_everything))
        config.set("Download", "download_last_week", str(self.download_last_week))
        config.set("Download", "skip_glob_duplicate_check", str(self.skip_glob_duplicate_check))
        config.set("Download", "skip_known_deleted", str(self.skip_known_deleted))
        config.set("Download", "deleted_submissions_list_path", str(self.deleted_submissions_list_path))
        config.set("Download", "move_on_fail_verification", str(self.move_on_fail_verification))
        config.set("Download", "save_comments", str(self.save_comments))
        config.add_section("General")
        config.set("General", "show_menu", str(self.show_menu))
        config.set("General", "hold_window_open", str(self.hold_window_open))
        with open(settings_path, "wb") as configfile:
            config.write(configfile)
        return
    def handle_command_line_arguments(self):
        """Handle any command line arguments"""
        parser = argparse.ArgumentParser(description="Download submissions and metadata from derpibooru.org")
        # Define what arguments are allowed
        menu_group = parser.add_mutually_exclusive_group()
        menu_group.add_argument("-m", "--menu", action="store_true", help="Show text based menu.")# Show text based menu
        menu_group.add_argument("-b", "--batch", action="store_true", help="Run in batch mode.")# Use batch mode
        parser.add_argument("-k", "--api_key", help="API Key.")
        parser.add_argument("-ids", "--download_submission_ids_list", help="download_submission_ids_list")
        parser.add_argument("-queries", "--download_query_list", help="download_query_list")
        parser.add_argument("-longfn", "--output_long_filenames", help="output_long_filenames")
        parser.add_argument("-qf", "--save_to_query_folder", help="save_to_query_folder")
        parser.add_argument("-skip", "--skip_downloads", help="skip_downloads")
        parser.add_argument("--sequentially_download_everything", help="sequentially_download_everything")
        parser.add_argument("--go_backwards_when_using_sequentially_download_everything", help="go_backwards_when_using_sequentially_download_everything")
        parser.add_argument("-ilp", "--input_list_path", help="input_list_path")
        parser.add_argument("--save_args_to_settings", action="store_true")# Write new settings to file
        # Store arguments to settings
        # (Apart from --menu/--batch, these options take string values; any non-empty value overrides the setting.)
        args = parser.parse_args()
        if args.menu:
            self.show_menu = True
        elif args.batch:
            self.show_menu = False
        if args.api_key:
            self.api_key = args.api_key
        if args.download_submission_ids_list:
            self.download_submission_ids_list = args.download_submission_ids_list
        if args.download_query_list:
            self.download_query_list = args.download_query_list
        if args.output_long_filenames:
            self.output_long_filenames = args.output_long_filenames
        if args.save_to_query_folder:
            self.save_to_query_folder = args.save_to_query_folder
        if args.skip_downloads:
            self.skip_downloads = args.skip_downloads
        if args.sequentially_download_everything:
            self.sequentially_download_everything = args.sequentially_download_everything
        if args.go_backwards_when_using_sequentially_download_everything:
            self.go_backwards_when_using_sequentially_download_everything = args.go_backwards_when_using_sequentially_download_everything
        if args.input_list_path:
            self.input_list_path = args.input_list_path
        # Write to the settings file if needed. Must be done last.
        if args.save_args_to_settings:
            self.save_settings(self.settings_path)# save_settings() requires the target path
        return
    def load_deleted_submission_list(self):
        """Load a list of known bad IDs from a file"""
        self.deleted_submissions_list = import_list(listfilename=self.deleted_submissions_list_path)
        return self.deleted_submissions_list
    def update_deleted_submission_list(self,submission_id):
        """Add a bad ID to the list, both in RAM and on disk"""
        self.deleted_submissions_list.append(submission_id)
        append_list(submission_id, list_file_path=self.deleted_submissions_list_path, initial_text="# List of deleted IDs.\n", overwrite=False)
        return
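# Illustrative sketch of how config_handler is used: constructing it loads the
# settings file (creating one with defaults on first run), applies any command
# line overrides, and exposes everything as attributes. The settings filename
# here follows the config folder convention above but is an assumption, not
# necessarily the exact name used elsewhere in this file.
def _example_config_handler():
    settings = config_handler(os.path.join("config", "derpibooru_dl_settings.ini"))
    logging.info("Downloads will be saved under: "+repr(settings.output_folder))
    logging.info("Query list path: "+repr(settings.input_list_path))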
def assert_is_string(object_to_test):
    """Make sure input is either a string or a unicode string"""
    if( (type(object_to_test) == type("")) or (type(object_to_test) == type(u"")) ):
        return
    logging.critical(repr(locals()))
    raise ValueError
def decode_json(json_string):
    """Wrapper for JSON decoding.
    Return a None object if a known problem case occurs.
    Return the decoded data if successful.
    Reraise unknown cases of caught exceptions."""
    assert_is_string(json_string)
    try:
        save_file(os.path.join("debug","last_json.json"), json_string, True)
        json_data = json.loads(json_string)
        return json_data
    except ValueError, err:
        # Retry if bad JSON received
        if "Unterminated string starting at:" in repr(err):
            logging.debug("JSON data invalid, failed to decode.")
            logging.debug(repr(json_string))
            return
        elif "No JSON object could be decoded" in repr(err):
            if len(json_string) < 20:
                logging.debug("JSON string was too short!")
                logging.debug(repr(json_string))
                return
            else:
                logging.critical(repr(locals()))
                raise err
        # Log locals and crash if unknown issue
        else:
            logging.critical(repr(locals()))
            raise err
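# Illustrative sketch of decode_json() on a minimal, made-up metadata string;
# real Derpibooru submission JSON contains many more fields than shown here.
def _example_decode_json():
    json_dict = decode_json('{"id": "1", "original_format": "png", "width": 1, "height": 1}')
    if json_dict is not None:
        logging.info("Decoded submission id: "+repr(json_dict["id"]))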
def read_file(path):
    """Grab the contents of a file"""
    f = open(path, "r")
    data = f.read()
    f.close()
    return data
def setup_browser():
    # Initialize the browser object to global variable "br", using cookie jar "cj"
    # Browser
    global br
    br = mechanize.Browser()
    br.set_cookiejar(cj)# cj is expected to be a module-level cookielib.CookieJar
    # Browser options
    br.set_handle_equiv(True)
    br.set_handle_gzip(True)
    br.set_handle_redirect(True)
    br.set_handle_referer(True)
    br.set_handle_robots(False)
    # Follows refresh 0 but doesn't hang on refresh > 0
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    # User-Agent (this is cheating, ok?)
    #br.addheaders = [("User-agent", "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1")]
    #br.addheaders = [("User-agent", "Trixie is worst pony")]#[13:57] <%barbeque> as long as it's not something like "trixie is worst pony"
    #print "trixie is worst pony"
    br.addheaders = [("User-agent", "derpibooru_dl.py - https://github.com/woodenphone/Derpibooru-dl")] # Let's make it easy for the admins to see us, so if something goes wrong we'll find out about it.
    return
def search_for_query(settings,search_query):
    """Perform a search for a query on derpibooru.
    Return a list of found submission IDs"""
    assert_is_string(search_query)
    logging.debug("Starting search for query: "+repr(search_query))
    found_submissions = []
    for image in derpibooru.Search().key(settings.api_key).limit(None).query(search_query):
        found_submissions.append(image.id)
    return found_submissions
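# Illustrative sketch of the same DerPyBooru search call used above, outside of
# the settings object; the query string is just the example taken from the
# template comments written by import_list(), and the key is the placeholder
# default rather than a real API key.
def _example_search_for_query():
    example_api_key = "Replace_this_with_your_API_key"# Assumed placeholder, same as the default setting
    for image in derpibooru.Search().key(example_api_key).limit(10).query("reversalis"):
        logging.info("Found submission ID: "+repr(image.id))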
def check_if_deleted_submission(json_dict):
    """Check whether the JSON dict for a submission shows it as having been deleted"""
    keys = json_dict.keys()
    if "deletion_reason" in keys:
        logging.error("Deleted submission! Reason: "+repr(json_dict["deletion_reason"]))
        return True
    elif "duplicate_of" in keys:
        logging.error("Deleted duplicate submission! Duplicate of: "+repr(json_dict["duplicate_of"]))
        return True
    else:
        return False
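# Illustrative sketch of the two deletion markers this script looks for in a
# submission's metadata dict; the values shown are made up for demonstration.
def _example_check_if_deleted_submission():
    assert check_if_deleted_submission({"deletion_reason": "Rule violation"}) is True
    assert check_if_deleted_submission({"duplicate_of": 12345}) is True
    assert check_if_deleted_submission({"id": "1"}) is False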
def copy_over_if_duplicate(settings,submission_id,output_folder):
    """Check if there is already a copy of the submission downloaded in the download path.
    If there is, copy the existing version to the supplied output location, then return True.
    If no copy can be found, return False"""
    assert_is_string(submission_id)
    # Setting to override this function, as a speed optimisation for single folder output
    if settings.skip_glob_duplicate_check:
        return False
    # Generate the expected filename pattern
    submission_filename_pattern = "*"+submission_id+".*"
    # Generate the search pattern
    glob_string = os.path.join(settings.output_folder, "*", submission_filename_pattern)
    # Use glob to check for existing files matching the expected pattern
    #logging.debug("CALLING glob.glob, local vars: "+ repr(locals()))
    glob_matches = glob.glob(glob_string)
    #logging.debug("CALLED glob.glob, locals: "+repr(locals()))
    # Check for matches; if there are none, return False
    if len(glob_matches) == 0:
        return False
    else:
        # If there is an existing version:
        for glob_match in glob_matches:
            # Skip any submission with the wrong ID
            match_submission_id = find_id_from_filename(settings, glob_match)
            if match_submission_id != submission_id:
                continue
            # If there is an existing version in the output path, nothing needs to be copied
            if output_folder in glob_match:
                return False
            else:
                # Copy over the submission file and metadata JSON
                logging.info("Trying to copy from previous download: "+repr(glob_match))
                # Check that the output folders exist
                # Build the expected paths
                match_dir, match_filename = os.path.split(glob_match)
                expected_json_input_filename = submission_id+".json"
                expected_json_input_folder = os.path.join(match_dir, "json")
                expected_json_input_location = os.path.join(expected_json_input_folder, expected_json_input_filename)
                json_output_folder = os.path.join(output_folder, "json")
                json_output_filename = submission_id+".json"
                json_output_path = os.path.join(json_output_folder, json_output_filename)
                submission_output_path = os.path.join(output_folder,match_filename)
                # Redownload if a file is missing
                if not os.path.exists(glob_match):
                    logging.debug("Submission file to copy is missing.")
                    return False
                if not os.path.exists(expected_json_input_location):
                    logging.debug("JSON file to copy is missing.")
                    return False
                # Ensure the output path exists
                if not os.path.exists(json_output_folder):
                    os.makedirs(json_output_folder)
                if not os.path.exists(output_folder):
                    os.makedirs(output_folder)
                logging.info("Copying files for submission: "+repr(submission_id)+" from "+repr(match_dir)+" to "+repr(output_folder))
                # Copy over the files
                try:
                    # Copy the submission file
                    shutil.copy2(glob_match, submission_output_path)
                    # Copy the JSON
                    shutil.copy2(expected_json_input_location, json_output_path)
                    return True
                except IOError, err:
                    logging.error("Error copying files!")
                    logging.exception(err)
                    return False
def download_submission(settings,search_query,submission_id):
    """Download a submission from Derpibooru"""
    assert_is_string(search_query)
    submission_id = str(submission_id)
    setup_browser()
    query_for_filename = convert_query_for_path(settings,search_query)
    #logging.debug("Downloading submission:"+submission_id)
    # Build JSON paths
    json_output_filename = submission_id+".json"
    if settings.save_to_query_folder is True:
        json_output_path = os.path.join(settings.output_folder,query_for_filename,"json",json_output_filename)
    else:
        # Option to save to a single combined folder
        json_output_path = os.path.join(settings.output_folder,settings.combined_download_folder_name,"json",json_output_filename)
    # Check if the download can be skipped
    # Check if the JSON already exists
    if os.path.exists(json_output_path):
        logging.debug("JSON for this submission already exists, skipping.")
        return
    # Build the output folder path
    if settings.save_to_query_folder is True:
        output_folder = os.path.join(settings.output_folder,query_for_filename)
    else:
        # Option to save to a single combined folder
        output_folder = os.path.join(settings.output_folder,settings.combined_download_folder_name)
    # Check for duplicates in the download folder
    if copy_over_if_duplicate(settings, submission_id, output_folder):
        return
    # Option to skip loading remote submission files
    if settings.skip_downloads is True:
        return
    # Option to skip previously encountered deleted submissions
    if settings.skip_known_deleted:
        if submission_id in settings.deleted_submissions_list:
            return
    # Build the JSON URL
    # Option to save comments; uses more resources.
    if settings.save_comments:
        json_url = "https://derpibooru.org/"+submission_id+".json?comments=true&key="+settings.api_key
    else:
        json_url = "https://derpibooru.org/"+submission_id+".json?key="+settings.api_key
    # Retry if needed
    download_attempt_counter = 0
    while download_attempt_counter <= settings.max_download_attempts:
        download_attempt_counter += 1
        if download_attempt_counter > 1:
            logging.debug("Attempt "+repr(download_attempt_counter))
        # Load the JSON URL
        json_page = get(json_url)
        if not json_page:
            continue
        # Convert the JSON to a dict
        json_dict = decode_json(json_page)
        if json_dict is None:
            continue
        # Check if the submission has been deleted
        if check_if_deleted_submission(json_dict):
            logging.debug("Submission was deleted.")
            logging.debug(repr(json_page))
            settings.update_deleted_submission_list(submission_id)
            return
        # Extract the needed info from the JSON
        image_url = json_dict["image"]
        image_file_ext = json_dict["original_format"]
        image_height = json_dict["height"]
        image_width = json_dict["width"]
        # Build the image output filenames
        if settings.output_long_filenames:
            # Grab the filename from the url by throwing away everything before the last forwardslash
            image_filename_crop_regex = """.+\/(.+)"""
            image_filename_search = re.search(image_filename_crop_regex, image_url, re.IGNORECASE|re.DOTALL)
            image_filename = image_filename_search.group(1)
            image_output_filename = settings.filename_prefix+image_filename+"."+image_file_ext
        else:
            image_output_filename = settings.filename_prefix+submission_id+"."+image_file_ext
        image_output_path = os.path.join(output_folder,image_output_filename)
        # Load the image data
        authenticated_image_url = image_url+"?key="+settings.api_key
        logging.debug("Loading submission image. Height:"+repr(image_height)+", Width:"+repr(image_width)+", URL: "+repr(authenticated_image_url))
        image_data = get(authenticated_image_url)
        if not image_data:
            return
        # The image should always be bigger than this; if it isn't, we got a bad file
        if len(image_data) < 100:
            logging.error("Image data was too small! "+repr(image_data))
            continue
        # Save the image
        save_file(image_output_path, image_data, True)
        # Save the JSON
        save_file(json_output_path, json_page, True)
        logging.debug("Download successful")
        return
    logging.error("Too many retries, skipping this submission.")
    logging.debug(repr(locals()))
    return
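# Illustrative sketch of the per-submission URLs download_submission() builds
# from the site's JSON endpoints; "1" and the API key are placeholder values,
# not ones the script supplies.
def _example_submission_urls():
    example_id = "1"
    example_key = "Replace_this_with_your_API_key"
    metadata_url = "https://derpibooru.org/"+example_id+".json?key="+example_key
    metadata_with_comments_url = "https://derpibooru.org/"+example_id+".json?comments=true&key="+example_key
    logging.debug("Metadata URL: "+repr(metadata_url))
    logging.debug("Metadata URL with comments: "+repr(metadata_with_comments_url))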
def read_pickle(file_path):
    file_data = read_file(file_path)
    pickle_data = pickle.loads(file_data)
    return pickle_data
def save_pickle(path,data):
    # Save data to a pickle file
    # Ensure the folder exists.
    if not os.path.exists(path):
        pickle_path_segments = os.path.split(path)
        pickle_dir = pickle_path_segments[0]
        if pickle_dir:# Make sure we aren't at the script root
            if not os.path.exists(pickle_dir):
                os.makedirs(pickle_dir)
    pf = open(path, "wb")
    pickle.dump(data, pf)
    pf.close()
    return
def save_resume_file(settings,search_tag,submission_ids):
    # Save submission IDs and search_tag to a pickle
    logging.debug("Saving resume data pickle")
    # {"search_tag":"FOO", "submission_ids":["1","2"]}
    # Build the dict
    resume_dict = {
        "search_tag":search_tag,
        "submission_ids":submission_ids
        }
    save_pickle(settings.resume_file_path, resume_dict)
    return
def clear_resume_file(settings):
    # Erase the resume pickle
    logging.debug("Erasing resume data pickle")
    if os.path.exists(settings.resume_file_path):
        os.remove(settings.resume_file_path)
    return
def resume_downloads(settings):
    # Look for a pickle of submissions to iterate over
    if os.path.exists(settings.resume_file_path):
        logging.debug("Resuming from pickle")
        # Read the pickle:
        resume_dict = read_pickle(settings.resume_file_path)
        search_tag = resume_dict["search_tag"]
        submission_ids = resume_dict["submission_ids"]
        # Iterate over the submissions
        download_submission_id_list(settings,submission_ids,search_tag)
        # Clear the temp file
        clear_resume_file(settings)
        append_list(search_tag, settings.done_list_path)
        return search_tag
    else:
        return False
def download_submission_id_list(settings,submission_ids,query):
    # Iterate over the submissions
    submission_counter = 0
    # If there are no submissions to save, record the failure
    if len(submission_ids) == 0:
        logging.warning("No submissions to save! Query:"+repr(query))
        append_list(query, settings.failed_list_path, initial_text="# List of failed items.\n")
    if settings.reverse:
        logging.info("Reverse mode is active, reversing download order.")
        submission_ids.reverse()
    for submission_id in submission_ids:
        submission_counter += 1
        # Only save the pickle every 1000 items, to help avoid pickle corruption
        if (submission_counter % 1000) == 0:
            cropped_submission_ids = submission_ids[( submission_counter - 1 ):]
            save_resume_file(settings,query,cropped_submission_ids)
        logging.info("Now working on submission "+repr(submission_counter)+" of "+repr(len(submission_ids))+" : "+repr(submission_id)+" for: "+repr(query))
        # Try downloading each submission
        download_submission(settings, query, submission_id)
        print "\n\n"
    return
def save_pointer_file(settings,start_number,finish_number):
    """Save start and finish numbers to a pickle"""
    logging.debug("Saving resume data pickle")
    # {"start_number":0, "finish_number":100}
    # Build the dict
    resume_dict = {
        "start_number":start_number,
        "finish_number":finish_number
        }
    save_pickle(settings.pointer_file_path, resume_dict)
    return
def clear_pointer_file(settings):
    """Erase the range download pickle"""
    logging.debug("Erasing resume data pickle")
    if os.path.exists(settings.pointer_file_path):
        os.remove(settings.pointer_file_path)
    return
def get_latest_submission_id(settings):
    """Find the most recent submission's ID"""
    logging.debug("Getting ID of most recent submission...")
    latest_submissions = []
    for image in derpibooru.Search().key(settings.api_key):
        submission_id = image.id
        latest_submissions.append(int(submission_id))
    # The default search returns the newest submissions, so the highest ID seen is the most recent one
    latest_submission_id = max(latest_submissions)
    logging.debug("Most recent submission ID:"+repr(latest_submission_id))
    return latest_submission_id
def download_this_weeks_submissions(settings):
    """Download (about) one week's worth of the most recent submissions"""
    logging.info("Now downloading the last week's submissions.")
    # Get the starting number
    latest_submission_id = get_latest_submission_id(settings)
    # Calculate the ending number
    one_weeks_submissions_number = 1000 * 7 # Less than 1000 submissions arrive per day
    finish_number = latest_submission_id - one_weeks_submissions_number
    logging.info("Downloading the last "+repr(one_weeks_submissions_number)+" submissions. Starting at "+repr(latest_submission_id)+" and stopping at "+repr(finish_number))
    download_range(settings,latest_submission_id,finish_number)
    return
def download_everything(settings):
    """Start downloading everything, or resume doing so"""
    logging.info("Now downloading all submissions on the site")
    # Start downloading everything
    latest_submission_id = get_latest_submission_id(settings)
    start_number = 0
    finish_number = latest_submission_id + 50000 # Add 50,000 to account for new submissions added during the run
    if settings.go_backwards_when_using_sequentially_download_everything:
        # Swap the start and finish numbers for backwards mode
        start_number, finish_number = latest_submission_id, start_number
    download_range(settings,start_number,finish_number)
    return
def resume_range_download(settings):
    # Look for a pickle of the range to iterate over
    if os.path.exists(settings.pointer_file_path):
        logging.info("Resuming range from pickle")
        # Read the pickle:
        resume_dict = read_pickle(settings.pointer_file_path)
        start_number = resume_dict["start_number"]
        finish_number = resume_dict["finish_number"]
        # Iterate over the range
        download_range(settings,start_number,finish_number)
    return
def download_range(settings,start_number,finish_number):
    """Try to download every submission within a given range.
    If the finish number is less than the start number, run over the range backwards"""
    # If the starting point is after the end point, we're going backwards
    if(start_number > finish_number):
        backwards = True
    else:
        backwards = False
    assert(finish_number <= 2000000)# Less than 2 million; 1,252,291 submissions as of 2016-09-18
    assert(start_number >= 0)# First submission is ID 0
    assert(type(finish_number) is type(1))# Must be an integer
    assert(type(start_number) is type(1))# Must be an integer
    total_submissions_to_attempt = abs(finish_number - start_number)
    logging.info("Downloading range: "+repr(start_number)+" to "+repr(finish_number))
    # Iterate over the range of ID numbers
    submission_pointer = start_number
    loop_counter = 0
    while (loop_counter <= total_submissions_to_attempt):
        loop_counter += 1
        assert(submission_pointer >= 0)# First submission is ID 0
        assert(submission_pointer <= 2000000)# Less than 2 million; 1,252,291 submissions as of 2016-09-18
        assert(type(submission_pointer) is type(1))# Must be an integer
        # Only save the pickle every 1000 items, to help avoid pickle corruption
        if (submission_pointer % 1000) == 0:
            save_pointer_file(settings, submission_pointer, finish_number)
        logging.info("Now working on submission "+repr(loop_counter)+" of "+repr(total_submissions_to_attempt)+", ID: "+repr(submission_pointer)+" for range download mode")
        # Try downloading each submission
        download_submission(settings, "RANGE_MODE", submission_pointer)
        print "\n\n"
        # Add/subtract from the pointer depending on the mode
        if backwards:
            submission_pointer -= 1
        else:
            submission_pointer += 1
    # Clean up once everything is done
    clear_pointer_file(settings)
    return
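# Illustrative sketch of the two ways download_range() can be driven: forwards
# over a small ID range, and backwards when the start number is higher than the
# finish number. The ID numbers are arbitrary examples.
def _example_download_range(settings):
    download_range(settings, 100, 110)# Forwards: IDs 100 through 110
    download_range(settings, 110, 100)# Backwards: IDs 110 down through 100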
def download_ids(settings,query_list,folder):
    logging.info("Now downloading user set IDs.")
    submission_ids = []
    for query in query_list:
        # Remove invalid items
        if re.search(r"[^\d]",query):
            logging.debug("Not a submission ID! skipping.")
            continue
        else:
            submission_ids.append(query)
    download_submission_id_list(settings,submission_ids,folder)
    return
def process_query(settings,search_query):
    """Download submissions for a tag on derpibooru"""
    assert_is_string(search_query)
    #logging.info("Processing tag: "+search_query)
    # Run the search for the query
    submission_ids = search_for_query(settings, search_query)
    # Save data for resuming
    if len(submission_ids) > 0:
        save_resume_file(settings,search_query,submission_ids)
    # Download all found items
    download_submission_id_list(settings,submission_ids,search_query)
    # Clear temp data
    clear_resume_file(settings)
    return
def download_query_list(settings,query_list):