#! /usr/bin/env python3
# TODO verify all 404 URLs
# TODO rename table to zipfiles
# TODO add table missing_404
# TODO add table missing_dcma
# folders with zip files
input_dir_long_filenames = "new-subs"
# folders with zip files
# short name format
# for output_format = "iso"
input_dir_short_filenames = "new-subs-num"
# use short filenames in archive
# short: 1.zip
# long : 1.alien.3.(1992).eng.2cd.zip
# short filenames are better for lookup by sub_number
# about 10x faster than glob (readdir + regex)
# for output_format = "iso"
use_short_filenames = True
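# a minimal sketch of the two lookup strategies compared above; "num" is a
# subtitle number. find_zip_short and find_zip_long are illustrative helpers,
# not used by this script. with short filenames the path is known in advance,
# so one stat() call replaces a full directory scan.
def find_zip_short(num):
    import os
    path = os.path.join(input_dir_short_filenames, f"{num}.zip")
    return path if os.path.exists(path) else None
def find_zip_long(num):
    import glob, os
    # glob = readdir + match: scans the whole directory on every lookup
    matches = glob.glob(os.path.join(input_dir_long_filenames, f"{num}.*.zip"))
    return matches[0] if matches else None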
repeat_count = 2
# verbose
debug_print = print
# quiet
#debug_print = lambda *args, **kwargs: None
# not mountable, slow random access
# good for small releases with append-only?
#output_format = "tar"
# harder to use than sqlite?
#output_format = "iso"
# requires mount to write files? not reproducible: cannot set file times to zero
#output_format = "udf"
# only UDF version 2.60
#output_format = "udf-pycdlib"
# sqlite is reproducible by default. nice!
# but: file header changes when we append data
# at offset 28 + 4 bytes:
# Size of the database file in pages. The "in-header database size".
# https://www.sqlite.org/fileformat2.html#database_header
output_format = "sqlite"
# big releases (monthly)
sqlite_group_by_language = True
# small releases (daily)
#sqlite_group_by_language = False
if output_format == "tar":
use_short_filenames = False
repeat_count = 1 # we already know its reproducible
if output_format == "sqlite":
use_short_filenames = False
store_filenames = False # dont create filenames.txt
repeat_count = 1 # we already know its reproducible
# create filenames.txt with the full filenames
store_filenames = use_short_filenames
# sqlite page size in bytes
# average file size is 20KB
# TODO benchmark
# re-create db with all page sizes:
# for page_size in 512 1024 2048 4096 8192 16384 32768 65536; do ( echo "PRAGMA page_size=$page_size;"; sqlite3 src.db .dump; ) | sqlite3 dst.pagesize-$page_size.db; done
"""
sqlite_page_size = 2**9 # 512 = min
sqlite_page_size = 2**10 # 1024 = 1K
sqlite_page_size = 2**11 # 2048 = 2K
sqlite_page_size = 2**12 # 4096 = 4K = default
sqlite_page_size = 2**13 # 8192 = 8K
sqlite_page_size = 2**14 # 16384 = 16K
sqlite_page_size = 2**15 # 32768 = 32K
sqlite_page_size = 2**16 # 65536 = 64K = max
"""
sqlite_page_size = 2**12 # 4096 = 4K = default
# benchmark
sqlite_compare_page_sizes = False
sqlite_page_sizes = [
    2**9, # 512B = min
    #2**10, # 1KiB
    #2**11, # 2KiB
    2**12, # 4KiB = default
    #2**13, # 8KiB
    2**14, # 16KiB
    #2**15, # 32KiB
    2**16, # 64KiB = max
]
# creation time is not relevant
# users want fast random read access
# iso: 600sec for 930MB
# sqlite: 150sec for 930MB
# opensubs.db: 1 - 9180517
# 9180518: not found
continue_from = 9180519
# opensubtitles.org.dump.9180519.to.9521948.by.lang.2023.04.26
continue_from = 9521948 + 1
# opensubtitles.org.dump.9521949.to.xxxxxxx.by.lang.2023.05.xx
week_seconds = 7 * 24 * 60 * 60
# create weekly releases
# first interval starts on unix epoch: Thursday 1970-01-01 00:00:00
#release_interval_seconds = week_seconds
# create a release every X nums
release_interval_nums = 100 * 1000 # 100K
# give moderators 1 week to delete files
# renaming files would require refetching files
# or deriving new names from subtitles_month.txt.gz
delay_release = week_seconds
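# illustrative sketch of the grouping and delay described above, not called
# anywhere: map a num to its release group, and only release a group once its
# newest subtitle (newest_sub_time, a hypothetical unix timestamp taken from
# sub_dates) is at least delay_release seconds old.
def release_group_of_num(num):
    idx = num // release_interval_nums
    first = idx * release_interval_nums
    last = first + release_interval_nums - 1
    return idx, first, last
def release_is_due(newest_sub_time):
    import time
    return (time.time() - newest_sub_time) >= delay_release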
# generated by opensubs-metadata-dump-json.py
sub_dates_dir = "opensubtitles-scraper-sub-dates"
sub_dates_remote_url = "https://github.com/milahu/opensubtitles-scraper-sub-dates"
sub_dates_part_size = 100 * 1000 # 100K
import subprocess
import sys
import os
import glob
import io
import math
import re
import shutil
import time
import hashlib
import sqlite3
import shlex
import json
import datetime
import natsort
os.makedirs(sub_dates_dir, exist_ok=True)
# TODO get num range of previous release
# opensubtitles.org.dump.9180519.to.9521948.by.lang.2023.04.26
previous_release_last_num = 9521948
release_first_num = previous_release_last_num + 1 # 9521949
release_idx = release_first_num // release_interval_nums
# TODO get num range of this release
release_last_num = (release_idx + 1) * release_interval_nums - 1
next_release_first_num = release_last_num + 1
print("release_idx", release_idx)
print("release_first_num", release_first_num)
print("release_last_num", release_last_num)
print("next_release_first_num", next_release_first_num)
# TODO init git repo at sub_dates_dir
# TODO update sub_dates
# last part can be incomplete -> start update at last part
print(f"sub_dates: listing remote branches")
args = [
    "git",
    "ls-remote",
    sub_dates_remote_url,
]
debug_print(shlex.join(args))
proc = subprocess.run(
    args,
    check=True,
    timeout=30,
    capture_output=True,
    encoding="utf8",
)
#print("proc.stdout", repr(proc.stdout))
remote_part_idx_list = []
for line in proc.stdout.strip().split("\n"):
    commit, ref = line.split("\t")
    if not ref.startswith("refs/heads/parts/"):
        continue
    part_idx = int(ref[len("refs/heads/parts/"):])
    #print(f"remote part {part_idx} = ref {ref} = commit {commit}")
    remote_part_idx_list.append(part_idx)
remote_part_idx_list = sorted(remote_part_idx_list)
last_remote_part_idx = remote_part_idx_list[-1]
print(f"remote_part_idx_list: {remote_part_idx_list}")
print(f"last_remote_part_idx: {last_remote_part_idx}")
raise NotImplementedError
# fetch last part
# fetch needed parts of sub_dates
sub_dates_first_part_idx = release_first_num // sub_dates_part_size
sub_dates_last_part_idx = release_last_num // sub_dates_part_size
print("sub_dates_first_part_idx", sub_dates_first_part_idx)
print("sub_dates_last_part_idx", sub_dates_last_part_idx)
for part_idx in range(sub_dates_first_part_idx, sub_dates_last_part_idx + 1):
    print(f"sub_dates {part_idx}: fetching branch parts/{part_idx} ...")
    args = [
        "git",
        "-C", sub_dates_dir,
        "fetch",
        #"--verbose",
        sub_dates_remote_url,
        f"parts/{part_idx}:parts/{part_idx}",
    ]
    debug_print(shlex.join(args))
    try:
        proc = subprocess.run(
            args,
            check=True,
            timeout=30,
        )
        print(f"sub_dates {part_idx}: fetching branch parts/{part_idx} done")
    except subprocess.CalledProcessError as error:
        print(f"sub_dates {part_idx}: fetching branch parts/{part_idx} failed")
        # fatal: refusing to fetch into branch 'refs/heads/parts/95' checked out at 'opensubtitles-scraper-sub-dates/parts/95'
    worktree_path = f"{sub_dates_dir}/parts/{part_idx}"
    if os.path.exists(worktree_path):
        # remove old worktree
        args = [
            "git",
            "-C", sub_dates_dir,
            "worktree",
            "remove",
            #"--force",
            f"parts/{part_idx}", # worktree path
        ]
        debug_print(shlex.join(args))
        proc = subprocess.run(
            args,
            check=True,
            timeout=10,
        )
    # add worktree
    args = [
        "git",
        "-C", sub_dates_dir,
        "worktree",
        "add",
        "--quiet",
        f"parts/{part_idx}", # worktree path
        f"parts/{part_idx}", # branch name
    ]
    debug_print(shlex.join(args))
    proc = subprocess.run(
        args,
        check=True,
        timeout=10,
    )
    # TODO check range of part
    sub_dates_part_path = f"{worktree_path}/sub-dates.100k.{part_idx}.jsonl"
    print(f"sub_dates {part_idx}: sub_dates_part_path: {repr(sub_dates_part_path)}")
    # note: use a new name here, don't shadow the global sub_dates_part_size (nums per part)
    sub_dates_part_file_size = os.path.getsize(sub_dates_part_path)
    with open(sub_dates_part_path) as f:
        first_line = next(f).strip()
        print(f"sub_dates {part_idx}: first_line: {repr(first_line)}")
        # jsonlines format
        assert first_line.startswith("[")
        first_num, first_time = json.loads(first_line)
        print(f"sub_dates {part_idx}: first_num: {repr(first_num)}")
        print(f"sub_dates {part_idx}: first_time: {repr(first_time)}")
        # assume UTC, not local time
        first_date = datetime.datetime.utcfromtimestamp(first_time)
        #first_date = datetime.datetime.fromtimestamp(first_time)
        first_date_str = first_date.strftime(r"%F %T")
        print(f"sub_dates {part_idx}: first_date_str: {repr(first_date_str)}")
        last_line_read_bytes = 1000
        # ValueError: negative seek position -1000
        #f.seek(-1 * last_line_read_bytes)
        f.seek(sub_dates_part_file_size - last_line_read_bytes)
        last_line = f.read(last_line_read_bytes).split("\n")[-2]
        print(f"sub_dates {part_idx}: last_line: {repr(last_line)}")
        # jsonlines format
        assert last_line.startswith("[")
        last_num, last_time = json.loads(last_line)
        print(f"sub_dates {part_idx}: last_num: {repr(last_num)}")
        print(f"sub_dates {part_idx}: last_time: {repr(last_time)}")
        # assume UTC, not local time
        last_date = datetime.datetime.utcfromtimestamp(last_time)
        #last_date = datetime.datetime.fromtimestamp(last_time)
        last_date_str = last_date.strftime(r"%F %T")
        print(f"sub_dates {part_idx}: last_date_str: {repr(last_date_str)}")
    raise NotImplementedError
raise NotImplementedError
"""
subtitles_month_txt_gz_path = "subtitles_month.txt.gz"
update_metadata = False
if not os.path.exists(subtitles_month_txt_gz_path):
update_metadata = True
else:
# file exists. check mtime
file_age = time.time() - os.path.getmtime(subtitles_month_txt_gz_path)
"""
"""
TODO "weekly" releases
get https://dl.opensubtitles.org/addons/export/subtitles_month.txt.gz
that url is updated every day at 10:00:00 local time (+0100)
parse subtitles_month.txt.gz to subtitles_month.db
see subtitles_all.txt.gz-parse.py
pull/push with https://github.com/milahu/opensubtitles-scraper-sub-dates
get time range of release
week_id 2782 = Thu 2023-04-27 00:00:00 to Wed 2023-05-03 23:59:59 = release Thu 2023-05-04
week_idx 2782 = Thu 2023-04-27 00:00:00 to Wed 2023-05-03 23:59:59 = release Thu 2023-05-04
note: times in UTC
get sub nums of release
note: time is not monotonic. TODO handle outliers?
IDSubtitle versus SubAddDate
IDSubtitle is monotonic -> TODO prefer IDSubtitle
-> not "weekly" releases, but releases grouped by IDSubtitle range
1 day = about 2K subs
5 days = about 10K subs # this
1 week = about 14K subs
groups of 10K:
group 0: num 0*10K to (1*10K-1)
group 1: num 1*10K to (2*10K-1)
group 2: num 2*10K to (3*10K-1)
group 3: num 3*10K to (4*10K-1)
groups are defined ONLY by IDSubtitle
if upstreams stops releasing new subtitles
or if our scraper breaks
then we dont make a "half release"
because our "live releases" are available at
https://github.com/milahu/opensubtitles-scraper-new-subs
continue_from = 9521948
sql_query = (
f"SELECT IDSubtitle, SubAddDate"
f" FROM subz_metadata"
f" WHERE IDSubtitle > {continue_from}"
#f" AND SubAddDate LIKE '2023-05-03 %'"
#f" ORDER BY IDSubtitle"
)
or_clauses = []
for day in release_days:
datestr = "2023-05-03" # TODO
or_clauses.append(f"SubAddDate LIKE '{datestr} %'")
sql_query += f" AND (" + " OR ".join(or_clauses) + ")"
sql_query += f" ORDER BY IDSubtitle"
print(sql_query)
time ranges:
first 3 weeks:
$ for week_idx in $(seq 0 3); do echo week_idx $week_idx = $(LC_ALL=C date --utc -d "1970-01-01+$((week_idx * 604800))sec" +"%a %F %T") to $(LC_ALL=C date --utc -d "1970-01-01+$(((week_idx + 1) * 604800 - 1))sec" +"%a %F %T") = release $(LC_ALL=C date --utc -d "1970-01-01+$(((week_idx + 1) * 604800))sec" +"%a %F"); done
week_idx 0 = Thu 1970-01-01 00:00:00 to Wed 1970-01-07 23:59:59 = release Thu 1970-01-08
week_idx 1 = Thu 1970-01-08 00:00:00 to Wed 1970-01-14 23:59:59 = release Thu 1970-01-15
week_idx 2 = Thu 1970-01-15 00:00:00 to Wed 1970-01-21 23:59:59 = release Thu 1970-01-22
current weeks:
$ for week_idx in $(seq 2780 2790); do echo week_idx $week_idx = $(LC_ALL=C date --utc -d "1970-01-01+$((week_idx * 604800))sec" +"%a %F %T") to $(LC_ALL=C date --utc -d "1970-01-01+$(((week_idx + 1) * 604800 - 1))sec" +"%a %F %T") = release $(LC_ALL=C date --utc -d "1970-01-01+$(((week_idx + 1) * 604800))sec" +"%a %F"); done
week_idx 2780 = Thu 2023-04-13 00:00:00 to Wed 2023-04-19 23:59:59 = release Thu 2023-04-20
week_idx 2781 = Thu 2023-04-20 00:00:00 to Wed 2023-04-26 23:59:59 = release Thu 2023-04-27
week_idx 2782 = Thu 2023-04-27 00:00:00 to Wed 2023-05-03 23:59:59 = release Thu 2023-05-04
week_idx 2783 = Thu 2023-05-04 00:00:00 to Wed 2023-05-10 23:59:59 = release Thu 2023-05-11
week_idx 2784 = Thu 2023-05-11 00:00:00 to Wed 2023-05-17 23:59:59 = release Thu 2023-05-18
week_idx 2785 = Thu 2023-05-18 00:00:00 to Wed 2023-05-24 23:59:59 = release Thu 2023-05-25
week_idx 2786 = Thu 2023-05-25 00:00:00 to Wed 2023-05-31 23:59:59 = release Thu 2023-06-01
week_idx 2787 = Thu 2023-06-01 00:00:00 to Wed 2023-06-07 23:59:59 = release Thu 2023-06-08
week_idx 2788 = Thu 2023-06-08 00:00:00 to Wed 2023-06-14 23:59:59 = release Thu 2023-06-15
week_idx 2789 = Thu 2023-06-15 00:00:00 to Wed 2023-06-21 23:59:59 = release Thu 2023-06-22
week_idx 2790 = Thu 2023-06-22 00:00:00 to Wed 2023-06-28 23:59:59 = release Thu 2023-06-29
"""
# reproducible filesystem images
# https://reproducible-builds.org/docs/system-images/
# https://unix.stackexchange.com/questions/572751/how-to-make-a-reproducible-iso-file-with-mkisofs-genisoimage
# validate config
if use_short_filenames == False:
    assert store_filenames == False, "storing the long filenames only makes sense with use_short_filenames = True"
"""
mount image:
mkdir mnt
sudo mount -o loop,ro test.iso mnt
# ls -U: don't sort. files are already sorted in the filesystem
ls -U mnt | head
sudo umount mnt
"""
"""
# wontfix: pycdlib can create only UDF version 2.60
# but we want 2.01 or 1.50 for compatibility
# https://github.com/clalancette/pycdlib/issues/113
# create reproducible UDF image
# set all times to zero
import time
def zero_time():
return 0.0
time.time = zero_time
# set all uuid's to zero
import uuid
real_uuid = uuid.UUID
def zero_uuid():
return real_uuid(hex="00000000000000000000000000000000")
uuid.UUID = zero_uuid
real_uuid4 = uuid.uuid4
def zero_uuid4():
return real_uuid4(hex="00000000000000000000000000000000")
uuid.uuid4 = zero_uuid4
# set random bits to zero
import random
def zero_getrandbits(k):
return 0
random.getrandbits = zero_getrandbits
import pycdlib
"""
# https://en.wikipedia.org/wiki/DVD
# Capacity: 4.7 GB (single-sided, single-layer – common)
# DVD-5: 4.70GB
# All units are expressed with SI/IEC prefixes (i.e., 1 Gigabyte = 1,000,000,000 bytes).
dvd_size = int(4.7 * 1000 * 1000 * 1000)
#max_size = 1000 * 1000 * 1000 # 1 GB
# max_size criteria:
# - smaller than 1GB
# - align to size of DVD
#max_size = (dvd_size // 5) - 5 * 1000 * 1000 # 935 MB
# remaining space on DVD: 5 * 5MB = 25MB = 0.53%
# don't split, we have only 8 GB
max_size = 100 * 1000 * 1000 * 1000 # 100 GB
size_tolerance = 0.02 # reserve 2% for filesystem headers
size_tolerance_udf = 0.02 # reserve 2% in UDF filesystem
#udf_media_type = "hd" # OSError: [Errno 28] No space left on device
udf_media_type = "dvdrw"
udf_enable_vat = True
udf_enable_vat = False
# setting blocksize causes weird errors
#udf_block_size = 512
udf_block_size = None
# For normal data, UDF 1.50 is OK.
# UDF 2.00 and 2.01 introduce additional functionality for streaming audio/video.
# https://github.com/pali/udftools/blob/master/doc/HOWTO.udf
#udf_version = "2.01"
udf_version = "1.50"
# minimum blocks_count depends on format
# mkudffs: Error: Not enough blocks on device
udf_min_blocks_count = 260 # --media-type=dvdrw --vat
if udf_media_type == "hd":
if udf_enable_vat:
udf_min_blocks_count = 260
else:
udf_min_blocks_count = 131
elif udf_media_type == "dvdrw":
if udf_enable_vat:
udf_min_blocks_count = 300
else:
udf_min_blocks_count = 2000
if False:
#if True:
    # debug
    input_dir_short_filenames = "new-subs-sample"
    max_size = 10 * 1000 * 1000 # 10 MB # debug
def create_empty_udf_image(udf_image_path, blocks_count, label):
    print(f"creating test UDF image: {udf_image_path}")
    group_label = label
    args = [
        "mkudffs",
        "--utf8", # Treat identifier string options as strings encoded in UTF-8.
        "--label=" + label,
        "--vid=" + label, # Volume Identifier. default is "LinuxUDF"
        "--vsid=" + group_label, # Volume Set Identifier. default is "LinuxUDF"
        "--fsid=" + group_label, # File Set Identifier. default is "LinuxUDF"
        "--uuid=" + (16 * "0"), # 16 hexadecimal lowercase digits. default is random
        # In most cases operating systems are unable to mount UDF filesystem if UDF block size differs from logical sector size of device.
        # Typically hard disks have sector size 512 bytes and optical media 2048 bytes.
        # Therefore UDF block size must match logical sector size of device.
        f"--media-type={udf_media_type}",
        f"--udfrev={udf_version}",
        "--new-file", # Create a new image file, fail if file already exists
        "--uid=0",
        "--gid=0",
        "--mode=0755", # mode of the root (/) directory. default is "0755"
        #"-path-list", sum_files_file,
        # Virtual Allocation Table a.k.a. VAT (Incremental Writing).
        # Used specifically for writing to write-once media
    ]
    if udf_block_size:
        args += [f"--blocksize={udf_block_size}"]
    if udf_enable_vat:
        args += ["--vat"]
    args += [
        udf_image_path, # device
        str(blocks_count),
    ]
    proc = subprocess.run(
        args,
        check=True,
    )
    assert os.path.exists(udf_image_path), f"mkudffs failed to create UDF image: {udf_image_path}"
    #os.chmod(udf_image_path, 0o644)
def create_empty_iso_image(iso_image_path, volid):
    print(f"creating test ISO image: {iso_image_path}")
    args = [
        "xorrisofs", # mkisofs compatibility mode of xorriso
        "-volid", volid,
        "-output", iso_image_path, # If not specified, stdout is used.
        #"-path-list", sum_files_file,
    ]
    proc = subprocess.run(
        args,
        check=True,
    )
    assert os.path.exists(iso_image_path), f"xorrisofs failed to create ISO image: {iso_image_path}"
    #os.chmod(iso_image_path, 0o644)
def mount_udf_image(udf_image_path, mount_dir):
    print(f"mounting UDF image: {udf_image_path}")
    # TODO set file times to zero (ctime, mtime, atime)
    # https://github.com/wolfcw/libfaketime
    # TZ=UTC faketime "1970-01-01 00:00:00" date +%s --utc
    mount_options = [
        "loop",
        "rw", # read-write
        "noatime", # Do not update access times for files on this filesystem.
        # https://www.kernel.org/doc/Documentation/filesystems/udf.txt
        "uid=0", # default user
        "gid=0", # default group
        "mode=0644", # default file permissions
        "dmode=0755", # default directory permissions
        #"umask=xxx", # default umask
    ]
    args = [
        "mount",
        "-o", ",".join(mount_options),
        "-t", "udf",
        udf_image_path,
        mount_dir,
    ]
    print("args", args)
    proc = subprocess.run(
        args,
        check=True,
    )
def mount_iso_image(iso_image_path, mount_dir):
    print(f"mounting ISO image: {iso_image_path}")
    # TODO set file times to zero (ctime, mtime, atime)
    # https://github.com/wolfcw/libfaketime
    # TZ=UTC faketime "1970-01-01 00:00:00" date +%s --utc
    mount_options = [
        "loop",
        "ro", # read only
    ]
    args = [
        "mount",
        "-o", ",".join(mount_options),
        "-t", "iso9660",
        iso_image_path,
        mount_dir,
    ]
    print("args", args)
    proc = subprocess.run(
        args,
        check=True,
    )
def unmount_dir(mount_dir):
    print(f"unmounting dir: {mount_dir}")
    args = [
        "umount",
        mount_dir,
    ]
    proc = subprocess.run(
        args,
        check=True,
    )
def test_mount_udf():
    # check if we can mount
    # create empty image file
    udf_image_path = "new-subs-archive.py-tmp.udf"
    create_empty_udf_image(udf_image_path, udf_min_blocks_count, "test")
    mount_dir = "new-subs-archive.py-tmp-mnt"
    os.makedirs(mount_dir, exist_ok=True)
    # unmount previously mounted image
    try:
        unmount_dir(mount_dir)
    except subprocess.CalledProcessError:
        pass
    try:
        mount_udf_image(udf_image_path, mount_dir)
    except subprocess.CalledProcessError:
        os.unlink(udf_image_path)
        os.rmdir(mount_dir)
        raise Exception(f"error: need root privileges to mount UDF image. hint: sudo python3 {sys.argv[0]}")
    unmount_dir(mount_dir)
    os.unlink(udf_image_path)
    os.rmdir(mount_dir)
def test_mount_iso():
    # check if we can mount
    # create empty image file
    iso_image_path = "new-subs-archive.py-tmp.iso"
    create_empty_iso_image(iso_image_path, "TEST")
    mount_dir = "new-subs-archive.py-tmp-mnt"
    os.makedirs(mount_dir, exist_ok=True)
    # unmount previously mounted image
    try:
        unmount_dir(mount_dir)
    except subprocess.CalledProcessError:
        pass
    try:
        mount_iso_image(iso_image_path, mount_dir)
    except subprocess.CalledProcessError:
        os.unlink(iso_image_path)
        os.rmdir(mount_dir)
        raise Exception(f"error: need root privileges to mount ISO image. hint: sudo python3 {sys.argv[0]}")
    unmount_dir(mount_dir)
    os.unlink(iso_image_path)
    os.rmdir(mount_dir)
# https://stackoverflow.com/a/1131238/10440128
def md5_filepath(filepath):
    file_hash = hashlib.md5()
    with open(filepath, "rb") as f:
        while chunk := f.read(8192):
            file_hash.update(chunk)
    return file_hash.hexdigest()
def pack_files(sum_files, sum_size):
    output_paths = []
    if repeat_count == 1:
        # don't repeat
        return pack_files_inner(sum_files, sum_size)
    print(f"creating {repeat_count} identical images ...")
    for _ in range(repeat_count):
        output_path = pack_files_inner(sum_files, sum_size)
        output_paths.append(output_path)
    print(f"creating {repeat_count} identical images done")
    print(f"identical image files:")
    for output_path in output_paths:
        print(f" {output_path}")
    print(f"comparing checksums of {repeat_count} identical images ...")
    checksums = []
    print(f"identical image checksums:")
    for output_path in output_paths:
        checksum = md5_filepath(output_path)
        print(f" {checksum} {output_path}")
        # compare to all previous checksums
        # fail on the first mismatch
        for previous_checksum in checksums:
            assert checksum == previous_checksum, "failed to produce identical image files"
        checksums.append(checksum)
    # return the first image, same as the repeat_count == 1 branch
    return output_paths[0]
def pack_files_inner(sum_files, sum_size):
    # sum_files is sorted by natsorted = numeric sort
    first_file = sum_files[0]
    last_file = sum_files[-1]
    if last_file.endswith("/filenames.txt"):
        last_file = sum_files[-2]
    print(f"first_file {first_file}")
    print(f"last_file {last_file}")
    first_num = int(os.path.basename(first_file).split(".")[0])
    last_num = int(os.path.basename(last_file).split(".")[0])
    sum_files = sorted(sum_files)
    def get_archive_path(first_num, last_num, file_extension, suffix_before_duplicate=None):
        suffix = f"-{suffix_before_duplicate}" if suffix_before_duplicate else ""
        archive_path = f"opensubtitles-{first_num}-{last_num}{suffix}.{file_extension}"
        duplicate = 1
        while os.path.exists(archive_path):
            duplicate += 1
            archive_path = f"opensubtitles-{first_num}-{last_num}{suffix}.{duplicate}.{file_extension}"
        return archive_path
    if output_format == "tar":
        # note: uncompressed tar, because content is compressed (zip files)
        file_extension = "tar"
        archive_path = get_archive_path(first_num, last_num, file_extension)
        pack_files_tar(archive_path, sum_files)
        return archive_path
    if output_format == "iso":
        file_extension = "iso"
        archive_path = get_archive_path(first_num, last_num, file_extension)
        volid = f"OPENSUBTITLES_{first_num}_{last_num}"
        pack_files_iso(archive_path, sum_files, volid)
        return archive_path
    if output_format == "udf":
        # mkudffs creates pure UDF, so we use extension "udf"
        file_extension = "udf"
        archive_path = get_archive_path(first_num, last_num, file_extension)
        label = f"opensubtitles-{first_num}-{last_num}"
        #group_label = f"opensubtitles"
        pack_files_udf(archive_path, sum_files, label, sum_size)
        return archive_path
    if output_format == "udf-pycdlib":
        # pycdlib creates impure UDF, so we use extension "iso"
        file_extension = "iso"
        archive_path = get_archive_path(first_num, last_num, file_extension)
        label = f"opensubtitles-{first_num}-{last_num}"
        #group_label = f"opensubtitles"
        pack_files_udf_pycdlib(archive_path, sum_files, label, sum_size)
        return archive_path
    if output_format == "sqlite":
        table_name = "zipfiles"
        file_extension = "db" # short, ambiguous
        #file_extension = "sqlite" # explicit, also used by archive.org for metadata
        if sqlite_group_by_language:
            files_by_lang = dict()
            for filepath in sum_files:
                # parse lang from filename
                # 000000001.alien.3.(1992).eng.2cd.zip
                # FIXME filepath is a bad source for language
                # the filepath-languages can be wrong or missing ("und" = undefined language)
                # instead, use metadata from subtitles_all.db
                # TODO maybe make a new release to replace opensubtitles.org.dump.9180519.to.9521948.by.lang.2023.04.26
                # to fix the language groups. add a migrate.py script so peers can fix their files
                lang = filepath.split(".")[-3]
                assert re.match(r"^[a-z]{3}$", lang)
                if lang not in files_by_lang:
                    files_by_lang[lang] = list()
                files_by_lang[lang].append(filepath)
            archive_paths = []
            for lang in files_by_lang:
                archive_path = get_archive_path(first_num, last_num, file_extension, lang)
                lang_files = files_by_lang[lang]
                pack_files_sqlite(archive_path, lang_files, table_name)
                archive_paths.append(archive_path)
            return archive_paths
        else:
            archive_path = get_archive_path(first_num, last_num, file_extension)
            pack_files_sqlite(archive_path, sum_files, table_name)
            return archive_path
    #elif output_format == "fat32":
    #    archive_path = f"opensubtitles-{first_num}-{last_num}.fat32"
    #    pack_files_fat32(archive_path, sum_files, sum_size)
    assert False, f"unknown output_format: {output_format}"
def pack_files_tar(archive_path, sum_files):
    print(f"packing {len(sum_files)} files to {archive_path}")
    sum_files_file = "new-subs-archive.py-sum_files.txt"
    with open(sum_files_file, "w") as f:
        f.write("\n".join(sum_files) + "\n")
    args = [
        "tar",
        # all these options are required to create reproducible archives
        # https://reproducible-builds.org/docs/archives/
        # TODO create reproducible archives with python tarfile
        # so this also works on windows
        "--format=gnu",
        "--sort=name", # sort filenames, independent of locale. tar v1.28
        "--mtime=0",
        "--owner=0",
        "--group=0",
        "--numeric-owner",
        "-c",
        "-f", archive_path,
        "-T", sum_files_file,
    ]
    subprocess.run(
        args,
        check=True,
    )
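# a rough sketch of the tarfile TODO above (illustrative, not called anywhere):
# normalize all metadata so python's tarfile writes the same bytes on every
# run. note: the output would still differ from gnu tar's output, so this is
# reproducible only against itself, not against pack_files_tar.
def pack_files_tar_python(archive_path, sum_files):
    import tarfile
    with tarfile.open(archive_path, "w", format=tarfile.GNU_FORMAT) as tar:
        for file_path in sorted(sum_files):
            info = tar.gettarinfo(file_path)
            # zero out everything that varies between hosts and runs
            info.mtime = 0
            info.uid = 0
            info.gid = 0
            info.uname = ""
            info.gname = ""
            with open(file_path, "rb") as f:
                tar.addfile(info, f)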
def pack_files_sqlite(db_path, sum_files, table_name, page_size=None):
    print(f"creating database {db_path} ...")
    t1 = time.time()
    assert os.path.exists(db_path) == False, f"error: output file exists: {db_path}"
    con = sqlite3.connect(db_path)
    cur = con.cursor()
    if page_size == None:
        page_size = sqlite_page_size
    # note: use the page_size argument, not the global sqlite_page_size
    cur.executescript(f"PRAGMA page_size = {page_size}; VACUUM;")
    cur.execute("PRAGMA count_changes=OFF")
    cur.execute(
        f"CREATE TABLE {table_name} (\n"
        f"  num INTEGER PRIMARY KEY,\n"
        f"  name TEXT,\n"
        f"  content BLOB\n"
        f")"
    )
    """
    # no. store missing numbers as text files
    cur.execute(
        f"CREATE TABLE missing_404 (\n"
        f"  num INTEGER PRIMARY KEY\n"
        f")"
    )
    sql_query = f"INSERT INTO missing_404 (num) VALUES (?)"
    # ...
    cur.execute(
        f"CREATE TABLE missing_dcma (\n"
        f"  num INTEGER PRIMARY KEY\n"
        f")"
    )
    sql_query = f"INSERT INTO missing_dcma (num) VALUES (?)"
    # ...
    """
    #sql_query = f"INSERT INTO {table_name} (num, name, content) VALUES (?, ?, ?)"
    sql_query = f"INSERT INTO {table_name} VALUES (?, ?, ?)"
    for file_path in sum_files:
        file_name = os.path.basename(file_path)
        name_parts = file_name.split(".")
        num = int(name_parts[0])
        assert name_parts[-1] == "zip", f"not a zip file: {file_path}"
        # check for legacy file format before new-subs-rename-remove-num-part.py
        assert name_parts[-2] != f"({num})", f"bad filename format: {file_path}"
        name = ".".join(name_parts[1:-1])
        # too complex
        # store only files here
        # and use a separate DB for all metadata
        #lang = name_parts[-3]
        #assert re.match(r"^[a-z]{3}$", lang)
        with open(file_path, "rb") as f:
            content = f.read()
        sql_args = (num, name, content)
        cur.execute(sql_query, sql_args)
    con.commit()
    con.close()
    t2 = time.time()
    print(f"creating database {db_path} done in {t2 - t1} seconds")
def pack_files_iso(iso_image_path, sum_files, volid):
    """
    ignore this error? ISO seems fine. later: error is gone.
    FIXME fails to create large iso of 1GB
    libisofs: FATAL : Image is most likely damaged. Calculated/written tree end address mismatch.
    libisofs: FATAL : Image is most likely damaged. Calculated/written image end address mismatch.
    libburn : FAILURE : Premature end of input encountered. Missing: 2048 bytes
    """
    t1 = time.time()
    print(f"packing {len(sum_files)} files to {iso_image_path}")
    print(f"creating image {iso_image_path} ...")
    sum_files_file = "new-subs-archive.py-sum_files.txt"
    with open(sum_files_file, "w") as f:
        f.write("\n".join(sum_files) + "\n")
    # TODO is this reproducible?
    assert re.match(r"^[A-Z0-9_]{0,32}$", volid), f"invalid volid: {repr(volid)}"
    # note: xorriso does not produce UDF filesystems
    args = [
        #"mkisofs",
        "xorrisofs", # mkisofs compatibility mode of xorriso
        "--modification-date=1970010100000000", # YYYYMMDDhhmmsscc
        "--set_all_file_dates", "set_to_mtime",
        "-uid", "0",
        "-gid", "0",
        "-volid", volid,
        #"--gpt_disk_guid", "modification-date",
        "--gpt_disk_guid", "00000000000000000000000000000000",
        "-no-cache-inodes", # we have no hardlinks
        "-dir-mode", "0755",
        "-file-mode", "0644", # we have no executable files
        # To create reproducible ISO-9660 filesystem images,
        # the options -creation-date, -effective-date, -modification-date and -noatime need to be specified,
        # and the -o option must not be used.
        "-output", iso_image_path, # If not specified, stdout is used.
        "-input-charset", "utf8",
        "-preparer", "", # default: XORRISO-1.5.4 2021.02.06.123001, LIBISOBURN-1.5.4, LIBISOFS-1.5.4, LIBBURN-1.5.4
        # TODO how to set file paths in the image?
        # all files are written to the root directory
        "-path-list", sum_files_file,
        # Allow more than one dot in filenames (e.g. .tar.gz) (violates ISO9660)
        # ignored by xorrisofs
        #"-allow-multidot",
    ]
    try:
        proc = subprocess.run(
            args,
            check=True,
            env={
                "PATH": os.environ["PATH"],
                "SOURCE_DATE_EPOCH": "0", # for xorriso
            },
            # capture output because xorrisofs is too verbose
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            encoding="utf8",
        )
    except subprocess.CalledProcessError as err:
        print(f"creating image {iso_image_path} done with error")
        print(f"xorrisofs output:")
        # "proc" is unbound when subprocess.run raises, so read the exception's captured output
        print(err.stdout)
        print()
    t2 = time.time()
    dt = t2 - t1
    print(f"creating image {iso_image_path} done in {dt} seconds")
    assert os.path.exists(iso_image_path), f"xorrisofs failed to create image: {iso_image_path}"
    # no. this takes long and requires root privileges
    # easier to create two identical images and assert equality
    # see repeat_count
    check_files = False
    if check_files:
        # check md5sum of all files
        t1 = time.time()
        print(f"checking files in {iso_image_path} ...")
        mount_dir = "new-subs-archive.py-tmp-mnt"
        os.makedirs(mount_dir, exist_ok=True)
        mount_iso_image(iso_image_path, mount_dir)
        for idx, src_file_path in enumerate(sum_files):
            if idx % 1000 == 0:
                print(f"progress: done {idx} of {len(sum_files)} files = {idx/len(sum_files)*100:.1f}%")
            with open(src_file_path, "rb") as f:
                expected_md5 = hashlib.md5(f.read()).hexdigest()
            dst_file_path = mount_dir + "/" + os.path.basename(src_file_path)
            with open(dst_file_path, "rb") as f:
                actual_md5 = hashlib.md5(f.read()).hexdigest()
            if actual_md5 != expected_md5:
                # cleanup
                unmount_dir(mount_dir)
                raise Exception(f"failed to verify file: {src_file_path} - expected md5: {expected_md5} - actual md5: {actual_md5}")
        unmount_dir(mount_dir)
        t2 = time.time()
        dt = t2 - t1
        print(f"checking files in {iso_image_path} done in {dt} seconds")
def pack_files_udf(output_path, sum_files, label, sum_size):
    udf_image_path = output_path
    print(f"packing {len(sum_files)} files to {output_path}")
    sum_files_file = "new-subs-archive.py-sum_files.txt"
    with open(sum_files_file, "w") as f:
        f.write("\n".join(sum_files) + "\n")
    # https://en.wikipedia.org/wiki/Universal_Disk_Format
    # Max. volume size:
    #   2 TiB (with 512-byte sectors)
    #   8 TiB (with 2 KiB sectors, like most optical discs)
    #   16 TiB (with 4 KiB sectors)
    # Max. filename length 255 bytes (path 1023 bytes)
    # note: don't use "genisoimage -udf" or "mkisofs -udf"
    # as they do not create a "pure UDF" filesystem
    # https://askubuntu.com/questions/1152527/creating-a-pure-udf-iso
    blocksize = 512
    #blocksize = 2048 # TODO?