#! /usr/bin/env python3
# TODO verify all 404 URLs
# TODO rename table to zipfiles
# TODO add table missing_404
# TODO add table missing_dcma
# folders with zip files
input_dir_long_filenames = "new-subs"
# folders with zip files
# short name format
# for output_format = "iso"
input_dir_short_filenames = "new-subs-num"
# use short filenames in archive
# short: 1.zip
# long : 1.alien.3.(1992).eng.2cd.zip
# short filenames are better for lookup by sub_number
# about 10x faster than glob (readdir + regex)
# for output_format = "iso"
use_short_filenames = True
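# a minimal sketch of the two lookup strategies compared above; "num" is a
# subtitle number. find_zip_short and find_zip_long are illustrative helpers,
# not used by this script. with short filenames the path is known in advance,
# so one stat() call replaces a full directory scan.
def find_zip_short(num):
    import os
    path = os.path.join(input_dir_short_filenames, f"{num}.zip")
    return path if os.path.exists(path) else None
def find_zip_long(num):
    import glob, os
    # glob = readdir + match: scans the whole directory on every lookup
    matches = glob.glob(os.path.join(input_dir_long_filenames, f"{num}.*.zip"))
    return matches[0] if matches else None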
repeat_count = 2
# verbose
debug_print = print
# quiet
#debug_print = lambda *args, **kwargs: None
# not mountable, slow random access
# good for small releases with append-only?
#output_format = "tar"
# harder to use than sqlite?
#output_format = "iso"
# requires mount to write files? not reproducible: cannot set file times to zero
#output_format = "udf"
# only UDF version 2.60
#output_format = "udf-pycdlib"
# sqlite is reproducible by default. nice!
# but: file header changes when we append data
# at offset 28 + 4 bytes:
# Size of the database file in pages. The "in-header database size".
# https://www.sqlite.org/fileformat2.html#database_header
output_format = "sqlite"
# big releases (monthly)
sqlite_group_by_language = True
# small releases (daily)
#sqlite_group_by_language = False
if output_format == "tar":
use_short_filenames = False
repeat_count = 1 # we already know its reproducible
if output_format == "sqlite":
use_short_filenames = False
store_filenames = False # dont create filenames.txt
repeat_count = 1 # we already know its reproducible
# create filenames.txt with the full filenames
store_filenames = use_short_filenames
# sqlite page size in bytes
# average file size is 20KB
# TODO benchmark
# re-create db with all page sizes:
# for page_size in 512 1024 2048 4096 8192 16384 32768 65536; do ( echo "PRAGMA page_size=$page_size;"; sqlite3 src.db .dump; ) | sqlite3 dst.pagesize-$page_size.db; done
"""
sqlite_page_size = 2**9 # 512 = min
sqlite_page_size = 2**10 # 1024 = 1K
sqlite_page_size = 2**11 # 2048 = 2K
sqlite_page_size = 2**12 # 4096 = 4K = default
sqlite_page_size = 2**13 # 8192 = 8K
sqlite_page_size = 2**14 # 16384 = 16K
sqlite_page_size = 2**15 # 32768 = 32K
sqlite_page_size = 2**16 # 65536 = 64K = max
"""
sqlite_page_size = 2**12 # 4096 = 4K = default
# benchmark
sqlite_compare_page_sizes = False
sqlite_page_sizes = [
    2**9, # 512B = min
    #2**10, # 1KiB
    #2**11, # 2KiB
    2**12, # 4KiB = default
    #2**13, # 8KiB
    2**14, # 16KiB
    #2**15, # 32KiB
    2**16, # 64KiB = max
]
# creation time is not relevant
# users want fast random read access
# iso: 600sec for 930MB
# sqlite: 150sec for 930MB
# opensubs.db: 1 - 9180517
# 9180518: not found
continue_from = 9180519
# opensubtitles.org.dump.9180519.to.9521948.by.lang.2023.04.26
continue_from = 9521948 + 1
# opensubtitles.org.dump.9521949.to.xxxxxxx.by.lang.2023.05.xx
week_seconds = 7 * 24 * 60 * 60
# create weekly releases
# first interval starts on unix epoch: Thursday 1970-01-01 00:00:00
#release_interval_seconds = week_seconds
# create a release every X nums
release_interval_nums = 100 * 1000 # 100K
# give moderators 1 week to delete files
# renaming files would require refetching files
# or deriving new names from subtitles_month.txt.gz
delay_release = week_seconds
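# illustrative sketch of the grouping and delay described above, not called
# anywhere: map a num to its release group, and only release a group once its
# newest subtitle (newest_sub_time, a hypothetical unix timestamp taken from
# sub_dates) is at least delay_release seconds old.
def release_group_of_num(num):
    idx = num // release_interval_nums
    first = idx * release_interval_nums
    last = first + release_interval_nums - 1
    return idx, first, last
def release_is_due(newest_sub_time):
    import time
    return (time.time() - newest_sub_time) >= delay_release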
# generated by opensubs-metadata-dump-json.py
sub_dates_dir = "opensubtitles-scraper-sub-dates"
sub_dates_remote_url = "https://github.com/milahu/opensubtitles-scraper-sub-dates"
sub_dates_part_size = 100 * 1000 # 100K
import subprocess
import sys
import os
import glob
import io
import math
import re
import shutil
import time
import hashlib
import sqlite3
import shlex
import json
import datetime
import natsort
os.makedirs(sub_dates_dir, exist_ok=True)
# TODO get num range of previous release
# opensubtitles.org.dump.9180519.to.9521948.by.lang.2023.04.26
previous_release_last_num = 9521948
release_first_num = previous_release_last_num + 1 # 9521949
release_idx = release_first_num // release_interval_nums
# TODO get num range of this release
release_last_num = (release_idx + 1) * release_interval_nums - 1
next_release_first_num = release_last_num + 1
print("release_idx", release_idx)
print("release_first_num", release_first_num)
print("release_last_num", release_last_num)
print("next_release_first_num", next_release_first_num)
# TODO init git repo at sub_dates_dir
# TODO update sub_dates
# last part can be incomplete -> start update at last part
print(f"sub_dates: listing remote branches")
args = [
    "git",
    "ls-remote",
    sub_dates_remote_url,
]
debug_print(shlex.join(args))
proc = subprocess.run(
    args,
    check=True,
    timeout=30,
    capture_output=True,
    encoding="utf8",
)
#print("proc.stdout", repr(proc.stdout))
remote_part_idx_list = []
for line in proc.stdout.strip().split("\n"):
    commit, ref = line.split("\t")
    if not ref.startswith("refs/heads/parts/"):
        continue
    part_idx = int(ref[len("refs/heads/parts/"):])
    #print(f"remote part {part_idx} = ref {ref} = commit {commit}")
    remote_part_idx_list.append(part_idx)
remote_part_idx_list = sorted(remote_part_idx_list)
last_remote_part_idx = remote_part_idx_list[-1]
print(f"remote_part_idx_list: {remote_part_idx_list}")
print(f"last_remote_part_idx: {last_remote_part_idx}")
raise NotImplementedError
# fetch last part
# fetch needed parts of sub_dates
sub_dates_first_part_idx = release_first_num // sub_dates_part_size
sub_dates_last_part_idx = release_last_num // sub_dates_part_size
print("sub_dates_first_part_idx", sub_dates_first_part_idx)
print("sub_dates_last_part_idx", sub_dates_last_part_idx)
for part_idx in range(sub_dates_first_part_idx, sub_dates_last_part_idx + 1):
    print(f"sub_dates {part_idx}: fetching branch parts/{part_idx} ...")
    args = [
        "git",
        "-C", sub_dates_dir,
        "fetch",
        #"--verbose",
        sub_dates_remote_url,
        f"parts/{part_idx}:parts/{part_idx}",
    ]
    debug_print(shlex.join(args))
    try:
        proc = subprocess.run(
            args,
            check=True,
            timeout=30,
        )
        print(f"sub_dates {part_idx}: fetching branch parts/{part_idx} done")
    except subprocess.CalledProcessError as error:
        print(f"sub_dates {part_idx}: fetching branch parts/{part_idx} failed")
        # fatal: refusing to fetch into branch 'refs/heads/parts/95' checked out at 'opensubtitles-scraper-sub-dates/parts/95'
    worktree_path = f"{sub_dates_dir}/parts/{part_idx}"
    if os.path.exists(worktree_path):
        # remove old worktree
        args = [
            "git",
            "-C", sub_dates_dir,
            "worktree",
            "remove",
            #"--force",
            f"parts/{part_idx}", # worktree path
        ]
        debug_print(shlex.join(args))
        proc = subprocess.run(
            args,
            check=True,
            timeout=10,
        )
    # add worktree
    args = [
        "git",
        "-C", sub_dates_dir,
        "worktree",
        "add",
        "--quiet",
        f"parts/{part_idx}", # worktree path
        f"parts/{part_idx}", # branch name
    ]
    debug_print(shlex.join(args))
    proc = subprocess.run(
        args,
        check=True,
        timeout=10,
    )
    # TODO check range of part
    sub_dates_part_path = f"{worktree_path}/sub-dates.100k.{part_idx}.jsonl"
    print(f"sub_dates {part_idx}: sub_dates_part_path: {repr(sub_dates_part_path)}")
    # note: use a new name here, don't shadow the global sub_dates_part_size (nums per part)
    sub_dates_part_file_size = os.path.getsize(sub_dates_part_path)
    with open(sub_dates_part_path) as f:
        first_line = next(f).strip()
        print(f"sub_dates {part_idx}: first_line: {repr(first_line)}")
        # jsonlines format
        assert first_line.startswith("[")
        first_num, first_time = json.loads(first_line)
        print(f"sub_dates {part_idx}: first_num: {repr(first_num)}")
        print(f"sub_dates {part_idx}: first_time: {repr(first_time)}")
        # assume UTC, not local time
        first_date = datetime.datetime.utcfromtimestamp(first_time)
        #first_date = datetime.datetime.fromtimestamp(first_time)
        first_date_str = first_date.strftime(r"%F %T")
        print(f"sub_dates {part_idx}: first_date_str: {repr(first_date_str)}")
        last_line_read_bytes = 1000
        # ValueError: negative seek position -1000
        #f.seek(-1 * last_line_read_bytes)
        f.seek(sub_dates_part_file_size - last_line_read_bytes)
        last_line = f.read(last_line_read_bytes).split("\n")[-2]
        print(f"sub_dates {part_idx}: last_line: {repr(last_line)}")
        # jsonlines format
        assert last_line.startswith("[")
        last_num, last_time = json.loads(last_line)
        print(f"sub_dates {part_idx}: last_num: {repr(last_num)}")
        print(f"sub_dates {part_idx}: last_time: {repr(last_time)}")
        # assume UTC, not local time
        last_date = datetime.datetime.utcfromtimestamp(last_time)
        #last_date = datetime.datetime.fromtimestamp(last_time)
        last_date_str = last_date.strftime(r"%F %T")
        print(f"sub_dates {part_idx}: last_date_str: {repr(last_date_str)}")
    raise NotImplementedError
raise NotImplementedError
"""
subtitles_month_txt_gz_path = "subtitles_month.txt.gz"
update_metadata = False
if not os.path.exists(subtitles_month_txt_gz_path):
update_metadata = True
else:
# file exists. check mtime
file_age = time.time() - os.path.getmtime(subtitles_month_txt_gz_path)
"""
"""
TODO "weekly" releases
get https://dl.opensubtitles.org/addons/export/subtitles_month.txt.gz
that url is updated every day at 10:00:00 local time (+0100)
parse subtitles_month.txt.gz to subtitles_month.db
see subtitles_all.txt.gz-parse.py
pull/push with https://github.com/milahu/opensubtitles-scraper-sub-dates
get time range of release
week_id 2782 = Thu 2023-04-27 00:00:00 to Wed 2023-05-03 23:59:59 = release Thu 2023-05-04
week_idx 2782 = Thu 2023-04-27 00:00:00 to Wed 2023-05-03 23:59:59 = release Thu 2023-05-04
note: times in UTC
get sub nums of release
note: time is not monotonic. TODO handle outliers?
IDSubtitle versus SubAddDate
IDSubtitle is monotonic -> TODO prefer IDSubtitle
-> not "weekly" releases, but releases grouped by IDSubtitle range
1 day = about 2K subs
5 days = about 10K subs # this
1 week = about 14K subs
groups of 10K:
group 0: num 0*10K to (1*10K-1)
group 1: num 1*10K to (2*10K-1)
group 2: num 2*10K to (3*10K-1)
group 3: num 3*10K to (4*10K-1)
groups are defined ONLY by IDSubtitle
if upstreams stops releasing new subtitles
or if our scraper breaks
then we dont make a "half release"
because our "live releases" are available at
https://github.com/milahu/opensubtitles-scraper-new-subs
continue_from = 9521948
sql_query = (
f"SELECT IDSubtitle, SubAddDate"
f" FROM subz_metadata"
f" WHERE IDSubtitle > {continue_from}"
#f" AND SubAddDate LIKE '2023-05-03 %'"
#f" ORDER BY IDSubtitle"
)
or_clauses = []
for day in release_days:
datestr = "2023-05-03" # TODO
or_clauses.append(f"SubAddDate LIKE '{datestr} %'")
sql_query += f" AND (" + " OR ".join(or_clauses) + ")"
sql_query += f" ORDER BY IDSubtitle"
print(sql_query)
time ranges:
first 3 weeks:
$ for week_idx in $(seq 0 3); do echo week_idx $week_idx = $(LC_ALL=C date --utc -d "1970-01-01+$((week_idx * 604800))sec" +"%a %F %T") to $(LC_ALL=C date --utc -d "1970-01-01+$(((week_idx + 1) * 604800 - 1))sec" +"%a %F %T") = release $(LC_ALL=C date --utc -d "1970-01-01+$(((week_idx + 1) * 604800))sec" +"%a %F"); done
week_idx 0 = Thu 1970-01-01 00:00:00 to Wed 1970-01-07 23:59:59 = release Thu 1970-01-08
week_idx 1 = Thu 1970-01-08 00:00:00 to Wed 1970-01-14 23:59:59 = release Thu 1970-01-15
week_idx 2 = Thu 1970-01-15 00:00:00 to Wed 1970-01-21 23:59:59 = release Thu 1970-01-22
current weeks:
$ for week_idx in $(seq 2780 2790); do echo week_idx $week_idx = $(LC_ALL=C date --utc -d "1970-01-01+$((week_idx * 604800))sec" +"%a %F %T") to $(LC_ALL=C date --utc -d "1970-01-01+$(((week_idx + 1) * 604800 - 1))sec" +"%a %F %T") = release $(LC_ALL=C date --utc -d "1970-01-01+$(((week_idx + 1) * 604800))sec" +"%a %F"); done
week_idx 2780 = Thu 2023-04-13 00:00:00 to Wed 2023-04-19 23:59:59 = release Thu 2023-04-20
week_idx 2781 = Thu 2023-04-20 00:00:00 to Wed 2023-04-26 23:59:59 = release Thu 2023-04-27
week_idx 2782 = Thu 2023-04-27 00:00:00 to Wed 2023-05-03 23:59:59 = release Thu 2023-05-04
week_idx 2783 = Thu 2023-05-04 00:00:00 to Wed 2023-05-10 23:59:59 = release Thu 2023-05-11
week_idx 2784 = Thu 2023-05-11 00:00:00 to Wed 2023-05-17 23:59:59 = release Thu 2023-05-18
week_idx 2785 = Thu 2023-05-18 00:00:00 to Wed 2023-05-24 23:59:59 = release Thu 2023-05-25
week_idx 2786 = Thu 2023-05-25 00:00:00 to Wed 2023-05-31 23:59:59 = release Thu 2023-06-01
week_idx 2787 = Thu 2023-06-01 00:00:00 to Wed 2023-06-07 23:59:59 = release Thu 2023-06-08
week_idx 2788 = Thu 2023-06-08 00:00:00 to Wed 2023-06-14 23:59:59 = release Thu 2023-06-15
week_idx 2789 = Thu 2023-06-15 00:00:00 to Wed 2023-06-21 23:59:59 = release Thu 2023-06-22
week_idx 2790 = Thu 2023-06-22 00:00:00 to Wed 2023-06-28 23:59:59 = release Thu 2023-06-29
"""
# reproducible filesystem images
# https://reproducible-builds.org/docs/system-images/
# https://unix.stackexchange.com/questions/572751/how-to-make-a-reproducible-iso-file-with-mkisofs-genisoimage
# validate config
if use_short_filenames == False:
    assert store_filenames == False, "storing the long filenames only makes sense with use_short_filenames = True"
"""
mount image:
mkdir mnt
sudo mount -o loop,ro test.iso mnt
# ls -U: don't sort. files are already sorted in the filesystem
ls -U mnt | head
sudo umount mnt
"""
"""
# wontfix: pycdlib can create only UDF version 2.60
# but we want 2.01 or 1.50 for compatibility
# https://github.com/clalancette/pycdlib/issues/113
# create reproducible UDF image
# set all times to zero
import time
def zero_time():
return 0.0
time.time = zero_time
# set all uuid's to zero
import uuid
real_uuid = uuid.UUID
def zero_uuid():
return real_uuid(hex="00000000000000000000000000000000")
uuid.UUID = zero_uuid
real_uuid4 = uuid.uuid4
def zero_uuid4():
return real_uuid4(hex="00000000000000000000000000000000")
uuid.uuid4 = zero_uuid4
# set random bits to zero
import random
def zero_getrandbits(k):
return 0
random.getrandbits = zero_getrandbits
import pycdlib
"""
# https://en.wikipedia.org/wiki/DVD
# Capacity: 4.7 GB (single-sided, single-layer – common)
# DVD-5: 4.70GB
# All units are expressed with SI/IEC prefixes (i.e., 1 Gigabyte = 1,000,000,000 bytes).
dvd_size = int(4.7 * 1000 * 1000 * 1000)
#max_size = 1000 * 1000 * 1000 # 1 GB
# max_size criteria:
# - smaller than 1GB
# - align to size of DVD
#max_size = (dvd_size // 5) - 5 * 1000 * 1000 # 935 MB
# remaining space on DVD: 5 * 5MB = 25MB = 0.53%
# don't split, we have only 8 GB
max_size = 100 * 1000 * 1000 * 1000 # 100 GB
size_tolerance = 0.02 # reserve 2% for filesystem headers
size_tolerance_udf = 0.02 # reserve 2% in UDF filesystem
#udf_media_type = "hd" # OSError: [Errno 28] No space left on device
udf_media_type = "dvdrw"
udf_enable_vat = True
udf_enable_vat = False
# setting blocksize causes weird errors
#udf_block_size = 512
udf_block_size = None
# For normal data, UDF 1.50 is OK.
# UDF 2.00 and 2.01 introduce additional functionality for streaming audio/video.
# https://github.com/pali/udftools/blob/master/doc/HOWTO.udf
#udf_version = "2.01"
udf_version = "1.50"
# minimum blocks_count depends on format
# mkudffs: Error: Not enough blocks on device
udf_min_blocks_count = 260 # --media-type=dvdrw --vat
if udf_media_type == "hd":
if udf_enable_vat:
udf_min_blocks_count = 260
else:
udf_min_blocks_count = 131
elif udf_media_type == "dvdrw":
if udf_enable_vat:
udf_min_blocks_count = 300
else:
udf_min_blocks_count = 2000
if False:
#if True:
    # debug
    input_dir_short_filenames = "new-subs-sample"
    max_size = 10 * 1000 * 1000 # 10 MB # debug
def create_empty_udf_image(udf_image_path, blocks_count, label):
    print(f"creating test UDF image: {udf_image_path}")
    group_label = label
    args = [
        "mkudffs",
        "--utf8", # Treat identifier string options as strings encoded in UTF-8.
        "--label=" + label,
        "--vid=" + label, # Volume Identifier. default is "LinuxUDF"
        "--vsid=" + group_label, # Volume Set Identifier. default is "LinuxUDF"
        "--fsid=" + group_label, # File Set Identifier. default is "LinuxUDF"
        "--uuid=" + (16 * "0"), # 16 hexadecimal lowercase digits. default is random
        # In most cases operating systems are unable to mount UDF filesystem if UDF block size differs from logical sector size of device.
        # Typically hard disks have sector size 512 bytes and optical media 2048 bytes.
        # Therefore UDF block size must match logical sector size of device.
        f"--media-type={udf_media_type}",
        f"--udfrev={udf_version}",
        "--new-file", # Create a new image file, fail if file already exists
        "--uid=0",
        "--gid=0",
        "--mode=0755", # mode of the root (/) directory. default is "0755"
        #"-path-list", sum_files_file,
        # Virtual Allocation Table a.k.a. VAT (Incremental Writing).
        # Used specifically for writing to write-once media
    ]
    if udf_block_size:
        args += [f"--blocksize={udf_block_size}"]
    if udf_enable_vat:
        args += ["--vat"]
    args += [
        udf_image_path, # device
        str(blocks_count),
    ]
    proc = subprocess.run(
        args,
        check=True,
    )
    assert os.path.exists(udf_image_path), f"mkudffs failed to create UDF image: {udf_image_path}"
    #os.chmod(udf_image_path, 0o644)
def create_empty_iso_image(iso_image_path, volid):
    print(f"creating test ISO image: {iso_image_path}")
    args = [
        "xorrisofs", # mkisofs compatibility mode of xorriso
        "-volid", volid,
        "-output", iso_image_path, # If not specified, stdout is used.
        #"-path-list", sum_files_file,
    ]
    proc = subprocess.run(
        args,
        check=True,
    )
    assert os.path.exists(iso_image_path), f"xorrisofs failed to create ISO image: {iso_image_path}"
    #os.chmod(iso_image_path, 0o644)
def mount_udf_image(udf_image_path, mount_dir):
    print(f"mounting UDF image: {udf_image_path}")
    # TODO set file times to zero (ctime, mtime, atime)
    # https://github.com/wolfcw/libfaketime
    # TZ=UTC faketime "1970-01-01 00:00:00" date +%s --utc
    mount_options = [
        "loop",
        "rw", # read-write
        "noatime", # Do not update access times for files on this filesystem.
        # https://www.kernel.org/doc/Documentation/filesystems/udf.txt
        "uid=0", # default user
        "gid=0", # default group
        "mode=0644", # default file permissions
        "dmode=0755", # default directory permissions
        #"umask=xxx", # default umask
    ]
    args = [
        "mount",
        "-o", ",".join(mount_options),
        "-t", "udf",
        udf_image_path,
        mount_dir,
    ]
    print("args", args)
    proc = subprocess.run(
        args,
        check=True,
    )
def mount_iso_image(iso_image_path, mount_dir):
    print(f"mounting ISO image: {iso_image_path}")
    # TODO set file times to zero (ctime, mtime, atime)
    # https://github.com/wolfcw/libfaketime
    # TZ=UTC faketime "1970-01-01 00:00:00" date +%s --utc
    mount_options = [
        "loop",
        "ro", # read only
    ]
    args = [
        "mount",
        "-o", ",".join(mount_options),
        "-t", "iso9660",
        iso_image_path,
        mount_dir,
    ]
    print("args", args)
    proc = subprocess.run(
        args,
        check=True,
    )
def unmount_dir(mount_dir):
    print(f"unmounting dir: {mount_dir}")
    args = [
        "umount",
        mount_dir,
    ]
    proc = subprocess.run(
        args,
        check=True,
    )
def test_mount_udf():
    # check if we can mount
    # create empty image file
    udf_image_path = "new-subs-archive.py-tmp.udf"
    create_empty_udf_image(udf_image_path, udf_min_blocks_count, "test")
    mount_dir = "new-subs-archive.py-tmp-mnt"
    os.makedirs(mount_dir, exist_ok=True)
    # unmount previously mounted image
    try:
        unmount_dir(mount_dir)
    except subprocess.CalledProcessError:
        pass
    try:
        mount_udf_image(udf_image_path, mount_dir)
    except subprocess.CalledProcessError:
        os.unlink(udf_image_path)
        os.rmdir(mount_dir)
        raise Exception(f"error: need root privileges to mount UDF image. hint: sudo python3 {sys.argv[0]}")
    unmount_dir(mount_dir)
    os.unlink(udf_image_path)
    os.rmdir(mount_dir)
def test_mount_iso():
    # check if we can mount
    # create empty image file
    iso_image_path = "new-subs-archive.py-tmp.iso"
    create_empty_iso_image(iso_image_path, "TEST")
    mount_dir = "new-subs-archive.py-tmp-mnt"
    os.makedirs(mount_dir, exist_ok=True)
    # unmount previously mounted image
    try:
        unmount_dir(mount_dir)
    except subprocess.CalledProcessError:
        pass
    try:
        mount_iso_image(iso_image_path, mount_dir)
    except subprocess.CalledProcessError:
        os.unlink(iso_image_path)
        os.rmdir(mount_dir)
        raise Exception(f"error: need root privileges to mount ISO image. hint: sudo python3 {sys.argv[0]}")
    unmount_dir(mount_dir)
    os.unlink(iso_image_path)
    os.rmdir(mount_dir)
# https://stackoverflow.com/a/1131238/10440128
def md5_filepath(filepath):
    file_hash = hashlib.md5()
    with open(filepath, "rb") as f:
        while chunk := f.read(8192):
            file_hash.update(chunk)
    return file_hash.hexdigest()
def pack_files(sum_files, sum_size):
    output_paths = []
    if repeat_count == 1:
        # don't repeat
        return pack_files_inner(sum_files, sum_size)
    print(f"creating {repeat_count} identical images ...")
    for _ in range(repeat_count):
        output_path = pack_files_inner(sum_files, sum_size)
        output_paths.append(output_path)
    print(f"creating {repeat_count} identical images done")
    print(f"identical image files:")
    for output_path in output_paths:
        print(f" {output_path}")
    print(f"comparing checksums of {repeat_count} identical images ...")
    checksums = []
    print(f"identical image checksums:")
    for output_path in output_paths:
        checksum = md5_filepath(output_path)
        print(f" {checksum} {output_path}")
        # compare to all previous checksums
        # fail on the first mismatch
        for previous_checksum in checksums:
            assert checksum == previous_checksum, "failed to produce identical image files"
        checksums.append(checksum)
    # return the first image, same as the repeat_count == 1 branch
    return output_paths[0]
def pack_files_inner(sum_files, sum_size):
    # sum_files is sorted by natsorted = numeric sort
    first_file = sum_files[0]
    last_file = sum_files[-1]
    if last_file.endswith("/filenames.txt"):
        last_file = sum_files[-2]
    print(f"first_file {first_file}")
    print(f"last_file {last_file}")
    first_num = int(os.path.basename(first_file).split(".")[0])
    last_num = int(os.path.basename(last_file).split(".")[0])
    sum_files = sorted(sum_files)
    def get_archive_path(first_num, last_num, file_extension, suffix_before_duplicate=None):
        suffix = f"-{suffix_before_duplicate}" if suffix_before_duplicate else ""
        archive_path = f"opensubtitles-{first_num}-{last_num}{suffix}.{file_extension}"
        duplicate = 1
        while os.path.exists(archive_path):
            duplicate += 1
            archive_path = f"opensubtitles-{first_num}-{last_num}{suffix}.{duplicate}.{file_extension}"
        return archive_path
    if output_format == "tar":
        # note: uncompressed tar, because content is compressed (zip files)
        file_extension = "tar"
        archive_path = get_archive_path(first_num, last_num, file_extension)
        pack_files_tar(archive_path, sum_files)
        return archive_path
    if output_format == "iso":
        file_extension = "iso"
        archive_path = get_archive_path(first_num, last_num, file_extension)
        volid = f"OPENSUBTITLES_{first_num}_{last_num}"
        pack_files_iso(archive_path, sum_files, volid)
        return archive_path
    if output_format == "udf":
        # mkudffs creates pure UDF, so we use extension "udf"
        file_extension = "udf"
        archive_path = get_archive_path(first_num, last_num, file_extension)
        label = f"opensubtitles-{first_num}-{last_num}"
        #group_label = f"opensubtitles"
        pack_files_udf(archive_path, sum_files, label, sum_size)
        return archive_path
    if output_format == "udf-pycdlib":
        # pycdlib creates impure UDF, so we use extension "iso"
        file_extension = "iso"
        archive_path = get_archive_path(first_num, last_num, file_extension)
        label = f"opensubtitles-{first_num}-{last_num}"
        #group_label = f"opensubtitles"
        pack_files_udf_pycdlib(archive_path, sum_files, label, sum_size)
        return archive_path
    if output_format == "sqlite":
        table_name = "zipfiles"
        file_extension = "db" # short, ambiguous
        #file_extension = "sqlite" # explicit, also used by archive.org for metadata
        if sqlite_group_by_language:
            files_by_lang = dict()
            for filepath in sum_files:
                # parse lang from filename
                # 000000001.alien.3.(1992).eng.2cd.zip
                # FIXME filepath is a bad source for language
                # the filepath-languages can be wrong or missing ("und" = undefined language)
                # instead, use metadata from subtitles_all.db
                # TODO maybe make a new release to replace opensubtitles.org.dump.9180519.to.9521948.by.lang.2023.04.26
                # to fix the language groups. add a migrate.py script so peers can fix their files
                lang = filepath.split(".")[-3]
                assert re.match(r"^[a-z]{3}$", lang)
                if lang not in files_by_lang:
                    files_by_lang[lang] = list()
                files_by_lang[lang].append(filepath)
            archive_paths = []
            for lang in files_by_lang:
                archive_path = get_archive_path(first_num, last_num, file_extension, lang)
                lang_files = files_by_lang[lang]
                pack_files_sqlite(archive_path, lang_files, table_name)
                archive_paths.append(archive_path)
            return archive_paths
        else:
            archive_path = get_archive_path(first_num, last_num, file_extension)
            pack_files_sqlite(archive_path, sum_files, table_name)
            return archive_path
    #elif output_format == "fat32":
    #    archive_path = f"opensubtitles-{first_num}-{last_num}.fat32"
    #    pack_files_fat32(archive_path, sum_files, sum_size)
    assert False, f"unknown output_format: {output_format}"
def pack_files_tar(archive_path, sum_files):
    print(f"packing {len(sum_files)} files to {archive_path}")
    sum_files_file = "new-subs-archive.py-sum_files.txt"
    with open(sum_files_file, "w") as f:
        f.write("\n".join(sum_files) + "\n")
    args = [
        "tar",
        # all these options are required to create reproducible archives
        # https://reproducible-builds.org/docs/archives/
        # TODO create reproducible archives with python tarfile
        # so this also works on windows
        "--format=gnu",
        "--sort=name", # sort filenames, independent of locale. tar v1.28
        "--mtime=0",
        "--owner=0",
        "--group=0",
        "--numeric-owner",
        "-c",
        "-f", archive_path,
        "-T", sum_files_file,
    ]
    subprocess.run(
        args,
        check=True,
    )
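# a rough sketch of the tarfile TODO above (illustrative, not called anywhere):
# normalize all metadata so python's tarfile writes the same bytes on every
# run. note: the output would still differ from gnu tar's output, so this is
# reproducible only against itself, not against pack_files_tar.
def pack_files_tar_python(archive_path, sum_files):
    import tarfile
    with tarfile.open(archive_path, "w", format=tarfile.GNU_FORMAT) as tar:
        for file_path in sorted(sum_files):
            info = tar.gettarinfo(file_path)
            # zero out everything that varies between hosts and runs
            info.mtime = 0
            info.uid = 0
            info.gid = 0
            info.uname = ""
            info.gname = ""
            with open(file_path, "rb") as f:
                tar.addfile(info, f)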
def pack_files_sqlite(db_path, sum_files, table_name, page_size=None):
    print(f"creating database {db_path} ...")
    t1 = time.time()
    assert os.path.exists(db_path) == False, f"error: output file exists: {db_path}"
    con = sqlite3.connect(db_path)
    cur = con.cursor()
    if page_size == None:
        page_size = sqlite_page_size
    # note: use the page_size argument, not the global sqlite_page_size
    cur.executescript(f"PRAGMA page_size = {page_size}; VACUUM;")
    cur.execute("PRAGMA count_changes=OFF")
    cur.execute(
        f"CREATE TABLE {table_name} (\n"
        f"  num INTEGER PRIMARY KEY,\n"
        f"  name TEXT,\n"
        f"  content BLOB\n"
        f")"
    )
    """
    # no. store missing numbers as text files
    cur.execute(
        f"CREATE TABLE missing_404 (\n"
        f"  num INTEGER PRIMARY KEY\n"
        f")"
    )
    sql_query = f"INSERT INTO missing_404 (num) VALUES (?)"
    # ...
    cur.execute(
        f"CREATE TABLE missing_dcma (\n"
        f"  num INTEGER PRIMARY KEY\n"
        f")"
    )
    sql_query = f"INSERT INTO missing_dcma (num) VALUES (?)"
    # ...
    """
    #sql_query = f"INSERT INTO {table_name} (num, name, content) VALUES (?, ?, ?)"
    sql_query = f"INSERT INTO {table_name} VALUES (?, ?, ?)"
    for file_path in sum_files:
        file_name = os.path.basename(file_path)
        name_parts = file_name.split(".")
        num = int(name_parts[0])
        assert name_parts[-1] == "zip", f"not a zip file: {file_path}"
        # check for legacy file format before new-subs-rename-remove-num-part.py
        assert name_parts[-2] != f"({num})", f"bad filename format: {file_path}"
        name = ".".join(name_parts[1:-1])
        # too complex
        # store only files here
        # and use a separate DB for all metadata
        #lang = name_parts[-3]
        #assert re.match(r"^[a-z]{3}$", lang)
        with open(file_path, "rb") as f:
            content = f.read()
        sql_args = (num, name, content)
        cur.execute(sql_query, sql_args)
    con.commit()
    con.close()
    t2 = time.time()
    print(f"creating database {db_path} done in {t2 - t1} seconds")
def pack_files_iso(iso_image_path, sum_files, volid):
    """
    ignore this error? ISO seems fine. later: error is gone.
    FIXME fails to create large iso of 1GB
    libisofs: FATAL : Image is most likely damaged. Calculated/written tree end address mismatch.
    libisofs: FATAL : Image is most likely damaged. Calculated/written image end address mismatch.
    libburn : FAILURE : Premature end of input encountered. Missing: 2048 bytes
    """
    t1 = time.time()
    print(f"packing {len(sum_files)} files to {iso_image_path}")
    print(f"creating image {iso_image_path} ...")
    sum_files_file = "new-subs-archive.py-sum_files.txt"
    with open(sum_files_file, "w") as f:
        f.write("\n".join(sum_files) + "\n")
    # TODO is this reproducible?
    assert re.match(r"^[A-Z0-9_]{0,32}$", volid), f"invalid volid: {repr(volid)}"
    # note: xorriso does not produce UDF filesystems
    args = [
        #"mkisofs",
        "xorrisofs", # mkisofs compatibility mode of xorriso
        "--modification-date=1970010100000000", # YYYYMMDDhhmmsscc
        "--set_all_file_dates", "set_to_mtime",
        "-uid", "0",
        "-gid", "0",
        "-volid", volid,
        #"--gpt_disk_guid", "modification-date",
        "--gpt_disk_guid", "00000000000000000000000000000000",
        "-no-cache-inodes", # we have no hardlinks
        "-dir-mode", "0755",
        "-file-mode", "0644", # we have no executable files
        # To create reproducible ISO-9660 filesystem images,
        # the options -creation-date, -effective-date, -modification-date and -noatime need to be specified,
        # and the -o option must not be used.
        "-output", iso_image_path, # If not specified, stdout is used.
        "-input-charset", "utf8",
        "-preparer", "", # default: XORRISO-1.5.4 2021.02.06.123001, LIBISOBURN-1.5.4, LIBISOFS-1.5.4, LIBBURN-1.5.4
        # TODO how to set file paths in the image?
        # all files are written to the root directory
        "-path-list", sum_files_file,
        # Allow more than one dot in filenames (e.g. .tar.gz) (violates ISO9660)
        # ignored by xorrisofs
        #"-allow-multidot",
    ]
    try:
        proc = subprocess.run(
            args,
            check=True,
            env={
                "PATH": os.environ["PATH"],
                "SOURCE_DATE_EPOCH": "0", # for xorriso
            },
            # capture output because xorrisofs is too verbose
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            encoding="utf8",
        )
    except subprocess.CalledProcessError as err:
        print(f"creating image {iso_image_path} done with error")
        print(f"xorrisofs output:")
        # "proc" is unbound when subprocess.run raises, so read the exception's captured output
        print(err.stdout)
        print()
    t2 = time.time()
    dt = t2 - t1
    print(f"creating image {iso_image_path} done in {dt} seconds")
    assert os.path.exists(iso_image_path), f"xorrisofs failed to create image: {iso_image_path}"
    # no. this takes long and requires root privileges
    # easier to create two identical images and assert equality
    # see repeat_count
    check_files = False
    if check_files:
        # check md5sum of all files
        t1 = time.time()
        print(f"checking files in {iso_image_path} ...")
        mount_dir = "new-subs-archive.py-tmp-mnt"
        os.makedirs(mount_dir, exist_ok=True)
        mount_iso_image(iso_image_path, mount_dir)
        for idx, src_file_path in enumerate(sum_files):
            if idx % 1000 == 0:
                print(f"progress: done {idx} of {len(sum_files)} files = {idx/len(sum_files)*100:.1f}%")
            with open(src_file_path, "rb") as f:
                expected_md5 = hashlib.md5(f.read()).hexdigest()
            dst_file_path = mount_dir + "/" + os.path.basename(src_file_path)
            with open(dst_file_path, "rb") as f:
                actual_md5 = hashlib.md5(f.read()).hexdigest()
            if actual_md5 != expected_md5:
                # cleanup
                unmount_dir(mount_dir)
                raise Exception(f"failed to verify file: {src_file_path} - expected md5: {expected_md5} - actual md5: {actual_md5}")
        unmount_dir(mount_dir)
        t2 = time.time()
        dt = t2 - t1
        print(f"checking files in {iso_image_path} done in {dt} seconds")
def pack_files_udf(output_path, sum_files, label, sum_size):
    udf_image_path = output_path
    print(f"packing {len(sum_files)} files to {output_path}")
    sum_files_file = "new-subs-archive.py-sum_files.txt"
    with open(sum_files_file, "w") as f:
        f.write("\n".join(sum_files) + "\n")
    # https://en.wikipedia.org/wiki/Universal_Disk_Format
    # Max. volume size:
    #   2 TiB (with 512-byte sectors)
    #   8 TiB (with 2 KiB sectors, like most optical discs)
    #   16 TiB (with 4 KiB sectors)
    # Max. filename length 255 bytes (path 1023 bytes)
    # note: don't use "genisoimage -udf" or "mkisofs -udf"
    # as they do not create a "pure UDF" filesystem
    # https://askubuntu.com/questions/1152527/creating-a-pure-udf-iso
    blocksize = 512
    #blocksize = 2048 # TODO?