# opensubs-metadata-dump-json.py

#!/usr/bin/env python3

import sys
import os
import re
import time
import sqlite3
import json
import subprocess
import shlex
import shutil
#import csv

# SQLite database generated by subtitles_all.txt.gz-parse.py
db_path = "opensubs-metadata.db"

# Directory that receives the part files (used as a git repository further
# down) and the base file name the per-part file names are derived from.
output_dir = "opensubtitles-scraper-sub-dates"
filename_base = "sub-dates.jsonl"

output_path_base = f"{output_dir}/{filename_base}"

# Number of consecutive subtitle IDs covered by one part file.
#part_size = 50000 # 1 part = 1 MByte, 120 parts
part_size = 100000 # 1 part = 2 MByte, 60 parts
#part_size = 250000 # 1 part = 5 MByte, 24 parts

# Short label for part_size that becomes part of every part file name
# (100000 -> "100k"). It is derived from part_size so the label cannot drift
# out of sync when another size is selected above; sizes that are not a
# multiple of 1000 fall back to the plain number.
part_name = f"{part_size // 1000}k" if part_size % 1000 == 0 else str(part_size)

# verbose
debug_print = print
# quiet
#debug_print = lambda _: None

# subtitles_all.txt.gz-parse.py
# Reference list of the columns that can be exported. Entries that are
# commented out were deliberately left out (the notes say why). This list is
# informational only: the columns that are actually exported are selected in
# column_names below.
available_column_names = [
    # zcat subtitles_all.txt.gz | head -n1 | tr '\t' '\n' | grep -n . | sed -E 's/^([0-9]+):(.*)$/"\2", # \1/'
    "IDSubtitle",
    "MovieName", # redundant with IMDB titles
    "MovieYear", # redundant with IMDB titles
    #"MovieNameIfNoImdbId", # 10% smaller
    #"MovieYearIfNoImdbId",
    #"LanguageName", # 4 # redundant with ISO639 # 5% smaller
    "ISO639",
    #"SubAddDate",
    "SubAddDateUnix", # 5% smaller
    "ImdbID",
    #"SubFormat",
    #"SubSumCD",
    #"MovieReleaseName", # redundant with IMDB titles/aliases # 30% smaller
    #"MovieFPS", # not needed? FPS is specified in *.sub files
    "SeriesSeason",
    "SeriesEpisode",
    "SeriesIMDBParent",
    #"MovieKind",
    "MovieKindIsMovie", # derived from MovieKind # 2% smaller
    #"URL", # redundant with IDSubtitle
]

# Columns to export, in output order. Previously this name was first bound to
# the long list above and then immediately rebound, which made it easy to
# misread which selection is in effect.
column_names = [
    "IDSubtitle",
    "SubAddDateUnix",
]

# The subtitle ID must stay the first value of every exported row.
assert column_names[0] == "IDSubtitle" # num = row[0]

# Derived columns: output column name -> SQL expression that computes it.
# Columns that are not listed here are selected by their plain name.
column_sql = dict(
    MovieNameIfNoImdbId="CASE WHEN ImdbID = 0 THEN MovieName ELSE '' END",
    MovieYearIfNoImdbId="CASE WHEN ImdbID = 0 THEN MovieYear ELSE 0 END",
    SubAddDateUnix="unixepoch(SubAddDate)",
    MovieKindIsMovie="CASE WHEN MovieKind = 'movie' THEN 1 ELSE 0 END",
)

# Python type of every known column (plain and derived); used to describe the
# exported columns in the schema file.
column_types = dict(
    IDSubtitle=int,
    MovieName=str,
    MovieNameIfNoImdbId=str,
    MovieYear=int,
    MovieYearIfNoImdbId=int,
    LanguageName=str,
    ISO639=str,
    SubAddDate=str,
    SubAddDateUnix=int,
    ImdbID=int,
    SubFormat=str,
    SubSumCD=int,
    MovieReleaseName=str,
    MovieFPS=float,
    SeriesSeason=int,
    SeriesEpisode=int,
    SeriesIMDBParent=int,
    MovieKind=str,
    MovieKindIsMovie=int,
    URL=str,
)

# Printable name of each supported Python type: {int: "int", ...}.
type_name = {py_type: py_type.__name__ for py_type in (int, float, str)}


if output_path_base.endswith(".jsonl"):
    # The JSONL rows are bare value arrays, so the schema (column names and
    # types, plus the SQL expression of derived columns) is stored in a
    # separate "<stem>.columns.json" file next to the parts.
    output_columns_path = output_path_base.rpartition(".")[0] + ".columns.json"
    #assert os.path.exists(output_columns_path) == False, f"error: output file exists: {output_columns_path}"
    print(f"writing {output_columns_path} ...")
    output_columns = [
        {
            "name": name,
            "type": str(type_name[column_types[name]]),
            # the "sql" key is only present for derived columns
            **({"sql": column_sql[name]} if name in column_sql else {}),
        }
        for name in column_names
    ]
    with open(output_columns_path, "w") as dst:
        json.dump(output_columns, dst, indent=2)


#assert os.path.exists(output_path_base) == False, f"error: output file exists: {output_path_base}"

con = sqlite3.connect(db_path)

# Alternative (disabled): read the column names from the table itself.
#   cur = con.cursor()
#   cur.row_factory = sqlite3.Row
#   sql_query = "SELECT * FROM subz_metadata LIMIT 1"
#   row = cur.execute(sql_query).fetchone()
#   column_names = list(row.keys())

cur = con.cursor()

# Derived columns are selected as "<expression> AS <name>", all other columns
# by their plain name.
column_names_sql = [
    f"{column_sql[name]} AS {name}" if name in column_sql else name
    for name in column_names
]

sql_query = "SELECT " + ", ".join(column_names_sql) + " FROM subz_metadata"
print(f"sql_query: {sql_query}")

# hole in dataset between 242445 and 3080254 = 2.5M missing nums
# (an empty part that starts at or below this ID does not end the export)
allow_empty_before = 3080254

# loop intervals
#
# Export the table in parts of part_size consecutive subtitle IDs. Every
# non-empty part is written to its own JSONL file, committed to its own orphan
# branch "parts/<part_idx>" through a temporary git worktree inside
# output_dir, and then removed from output_dir again.


def run_git(cwd, *git_args):
    """Run ``git -C <cwd> <git_args...>``.

    The command line is echoed through debug_print before it is executed.

    Raises:
        subprocess.CalledProcessError: git exited with a non-zero status.
        subprocess.TimeoutExpired: git did not finish within 10 seconds.
    """
    args = ["git", "-C", cwd, *git_args]
    debug_print(shlex.join(args))
    subprocess.run(
        args,
        check=True,
        timeout=10,
    )


# Part file names look like "<stem>.<part_name>.<part_idx>.<extension>".
# The pieces do not change between parts, so they are computed once.
output_path_stem, _, output_path_ext = output_path_base.rpartition(".")
#output_path_part = f"{output_path_stem}.count.{part_size}.from.{part_first}.{output_path_ext}"
#output_path_part = f"{output_path_stem}.count.{part_size}.part.{part_idx}.{output_path_ext}"

# "ORDER BY" must come after "WHERE"
# The ID range of the part is bound through the two placeholders.
sql_query_part = sql_query + " WHERE IDSubtitle BETWEEN ? AND ? ORDER BY IDSubtitle ASC"

# Disabled: also commit a .gitattributes file that turns off delta compression
# for zip files. Not needed while the parts are plain JSONL files.
# https://stackoverflow.com/questions/7102053/git-pull-without-remotely-compressing-objects
add_gitattributes = False

part_idx = -1
while True:
    part_idx += 1
    part_first = part_size * part_idx
    part_last = part_first + part_size - 1

    output_path_part = f"{output_path_stem}.{part_name}.{part_idx}.{output_path_ext}"

    if os.path.exists(output_path_part):
        # left over from an earlier run that did not get to commit it
        print(f"exists {output_path_part}")

    else:
        # json is typed: str, int, float, bool, NoneType
        # (csv was rejected: csv is untyped, every value is a string)
        if not output_path_part.endswith(".jsonl"):
            # The extension is the same for every part, so carrying on would
            # only repeat this error forever: stop instead.
            raise SystemExit(f"error: unknown file extension in output file: {output_path_base}")

        # Write to a temporary file first and rename it when it is complete.
        # An interrupted run then cannot leave a truncated part behind that
        # the "exists" check above would accept and commit on the next run.
        output_path_tmp = output_path_part + ".tmp"
        num_rows = 0
        with open(output_path_tmp, "w") as dst:
            for row in cur.execute(sql_query_part, (part_first, part_last)):
                # one compact JSON array per row, one row per line
                line = json.dumps(
                    row,
                    indent=None,
                    separators=(',', ':'),
                    #separators=(', ', ': '),
                )
                dst.write(line + "\n")
                num_rows += 1

        if num_rows == 0:
            print(f"empty {output_path_part}")
            os.unlink(output_path_tmp)
            if allow_empty_before < part_first:
                # an empty part past the known hole marks the end of the data
                print("done all")
                break
            # empty part inside the known hole: nothing to commit
            continue

        os.replace(output_path_tmp, output_path_part)
        print(f"done {output_path_part}")


    # add to git

    filename = os.path.basename(output_path_part)
    print(f"git add {filename}")

    # "parts/<part_idx>" is used both as the worktree path (relative to
    # output_dir) and as the branch name.
    worktree_name = f"parts/{part_idx}"
    worktree_path = f"{output_dir}/{worktree_name}"

    if os.path.exists(worktree_path):
        # remove old worktree (no "--force")
        run_git(output_dir, "worktree", "remove", worktree_name)

    # fresh worktree without any files, then an orphan branch with an empty
    # index and an empty working directory
    run_git(output_dir, "worktree", "add", "--quiet", "--detach", "--no-checkout", worktree_name)
    run_git(worktree_path, "checkout", "--quiet", "--orphan", worktree_name)
    run_git(worktree_path, "reset")
    run_git(worktree_path, "clean", "-fdq")

    # copy file to worktree
    shutil.copyfile(output_path_part, f"{worktree_path}/{filename}")
    run_git(worktree_path, "add", filename)

    if add_gitattributes:
        # disable compression for zip files
        gitattributes_path = f"{worktree_path}/.gitattributes"
        with open(gitattributes_path, "w") as f:
            f.write("*.zip -delta\n")
        run_git(worktree_path, "add", os.path.basename(gitattributes_path))

    run_git(worktree_path, "commit", "--quiet", "-m", f"add {part_idx}")
    #run_git(worktree_path, "commit", "--quiet", "-m", f"add part {part_idx}")

    run_git(output_dir, "worktree", "remove", worktree_name)

    # the part now lives in its branch: drop the copy in output_dir
    os.unlink(output_path_part)