bin/mtag/yt-ipr.py

#!/usr/bin/env python

import re
import os
import sys
import gzip
import json
import base64
import string
import urllib.request
from datetime import datetime

"""
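youtube initial player response

parses a dumped youtube initial player response (optionally gzipped),
prints its metadata as json tags (yt-id, yt-title, yt-author, ...),
and for youtube responses also writes <video-id>.urls.json
(and tries to fetch <video-id>.jpg) next to the input file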

it's probably best to use this through a config file; see res/yt-ipr.conf

but if you want to use plain arguments instead then:
  -v srv/ytm:ytm:w:rw,ed
       :c,e2ts,e2dsa
       :c,sz=16k-1m:c,maxn=10,300:c,rotf=%Y-%m/%d-%H
       :c,mtp=yt-id,yt-title,yt-author,yt-channel,yt-views,yt-private,yt-manifest,yt-expires=bin/mtag/yt-ipr.py
       :c,mte=yt-id,yt-title,yt-author,yt-channel,yt-views,yt-private,yt-manifest,yt-expires

see res/yt-ipr.user.js for the example userscript to go with this
"""


def main():
    """Read the dump named by ``sys.argv[1]`` and dispatch it to a parser.

    The file is first read as gzip-compressed UTF-8 text; if that fails it
    is read again as plain UTF-8 text. Everything before the first ``{`` is
    dropped, and if the JSON decoder rejects the remainder, the text is cut
    at the reported error position and decoded once more (this tolerates
    trailing non-JSON data after the document).

    Documents with a top-level ``videoDetails`` key are passed to
    ``parse_youtube``; everything else is passed to ``parse_freg``.

    Raises:
        IndexError: if no argument was given or the file contains no ``{``.
        json.JSONDecodeError: if the text is still invalid after trimming.
        OSError: if the file cannot be read at all.
    """
    path = sys.argv[1]

    try:
        with gzip.open(path, "rt", encoding="utf-8", errors="replace") as f:
            txt = f.read()
    except Exception:
        # not gzip, or a corrupt/truncated archive: retry as plain text.
        # `Exception` instead of a bare `except` so that KeyboardInterrupt
        # and SystemExit are not silently turned into a second read
        with open(path, "r", encoding="utf-8", errors="replace") as f:
            txt = f.read()

    # discard whatever precedes the json document
    txt = "{" + txt.split("{", 1)[1]

    try:
        pd = json.loads(txt)
    except json.decoder.JSONDecodeError as ex:
        # retry with only the part the decoder managed to get through
        pd = json.loads(txt[: ex.pos])

    # print(json.dumps(pd, indent=2))

    if "videoDetails" in pd:
        parse_youtube(pd)
    else:
        parse_freg(pd)


def get_expiration(url):
    """Return the expiry time embedded in a stream URL, formatted in UTC.

    Args:
        url: a URL carrying an ``expire=<unix seconds>`` query parameter.

    Returns:
        The expiry time as a ``"YYYY-MM-DD, HH:MM"`` string in UTC.

    Raises:
        ValueError: if the URL has no numeric ``expire`` query parameter.
    """
    # local import; the module itself only imports the `datetime` class
    from datetime import timezone

    m = re.search(r"[?&]expire=([0-9]+)", url)
    if not m:
        raise ValueError("no expire parameter in url")

    # timezone-aware equivalent of datetime.utcfromtimestamp, which is
    # deprecated since python 3.12; the formatted result is the same
    et = datetime.fromtimestamp(int(m.group(1)), timezone.utc)
    return et.strftime("%Y-%m-%d, %H:%M")


def parse_youtube(pd):
    """Print the tags of a youtube player response as one json object.

    ``pd`` is the decoded player response and must contain both
    ``videoDetails`` and ``streamingData``. The expiry tag is derived from
    the url of the first adaptive format. After printing, the response is
    handed to ``freg_conv``.
    """
    details = pd["videoDetails"]
    streaming = pd["streamingData"]

    first_url = streaming["adaptiveFormats"][0]["url"]
    expires = get_expiration(first_url)

    # manifest kinds offered by the response, dash listed before hls
    manifests = [
        label
        for key, label in (("dashManifestUrl", "dash"), ("hlsManifestUrl", "hls"))
        if key in streaming
    ]

    # tag name -> videoDetails field; insertion order is the output order
    copied = (
        ("yt-id", "videoId"),
        ("yt-title", "title"),
        ("yt-author", "author"),
        ("yt-channel", "channelId"),
        ("yt-views", "viewCount"),
        ("yt-private", "isPrivate"),
    )
    tags = {}
    for tag, field in copied:
        tags[tag] = details[field]

    tags["yt-manifest"] = ",".join(manifests)
    tags["yt-expires"] = expires
    print(json.dumps(tags))

    freg_conv(pd)


def parse_freg(pd):
    """Print the tags of a document that has ``metadata`` and ``video`` keys.

    This is the layout ``freg_conv`` writes. The channel tag is the last
    path segment of ``metadata.channelURL`` and the expiry is taken from
    the first url in ``video``.
    """
    meta = pd["metadata"]

    tags = {
        "yt-id": meta["id"],
        "yt-title": meta["title"],
        "yt-author": meta["channelName"],
    }

    # ignore slashes at either end, then keep what follows the last one
    channel_url = meta["channelURL"].strip("/")
    tags["yt-channel"] = channel_url.rsplit("/", 1)[-1]

    video_urls = list(pd["video"].values())
    tags["yt-expires"] = get_expiration(video_urls[0])

    print(json.dumps(tags))


def freg_conv(pd):
    """Write ``<video-id>.urls.json`` into the directory of ``sys.argv[1]``.

    The output follows the layout of getURLs.js v1.5: a ``metadata`` object
    (title, id, channel, description, thumbnail url, start timestamp and the
    thumbnail itself as a base64 ``data:`` url), a ``version``, a
    ``createTime`` in UTC, and for ``video`` and ``audio`` the url of the
    first itag from the priority list that the response offers.

    The thumbnail is cached as ``<video-id>.jpg`` in the same directory; it
    is only downloaded when that file does not exist yet and the url is on
    one of the two allowed youtube image hosts. A failed download is
    ignored and an embedded placeholder jpeg is used instead.

    Args:
        pd: decoded player response; ``videoDetails`` (videoId, channelId,
            title, author, shortDescription) and
            ``streamingData.adaptiveFormats`` are required, ``microformat``
            is optional.

    Returns:
        None. If the video id contains anything besides ascii letters,
        digits, ``_`` and ``-``, a message is printed to stderr and nothing
        is written, since the id becomes part of a filename.

    Raises:
        KeyError: if a required key is missing from ``pd``.
        OSError: if the output file cannot be written.
    """
    # based on getURLs.js v1.5 (2021-08-07)

    # local import; the module itself only imports the `datetime` class
    from datetime import timezone

    # fmt: off
    priority = {
        "video": [
            337, 315, 266, 138,  # 2160p60
            313, 336,  # 2160p
            308,  # 1440p60
            271, 264,  # 1440p
            335, 303, 299,  # 1080p60
            248, 169, 137,  # 1080p
            334, 302, 298,  # 720p60
            247, 136  # 720p
        ],
        "audio": [
            251, 141, 171, 140, 250, 249, 139
        ]
    }
    # fmt: on

    vid_id = pd["videoDetails"]["videoId"]
    chan_id = pd["videoDetails"]["channelId"]

    try:
        mf = pd["microformat"]["playerMicroformatRenderer"]
        thumb_url = mf["thumbnail"]["thumbnails"][0]["url"]
        start_ts = mf["liveBroadcastDetails"]["startTimestamp"]
    except (KeyError, IndexError, TypeError):
        # NOTE(review): either lookup failing resets both values, so a
        # response without liveBroadcastDetails also loses its microformat
        # thumbnail url -- kept as-is, confirm whether that is intended
        thumb_url = f"https://img.youtube.com/vi/{vid_id}/maxresdefault.jpg"
        start_ts = ""

    metadata = {
        "title": pd["videoDetails"]["title"],
        "id": vid_id,
        "channelName": pd["videoDetails"]["author"],
        "channelURL": "https://www.youtube.com/channel/" + chan_id,
        "description": pd["videoDetails"]["shortDescription"],
        "thumbnailUrl": thumb_url,
        "startTimestamp": start_ts,
    }

    # the id ends up in filenames below, so refuse anything that could
    # leave the target directory
    allowed = set(string.ascii_letters + string.digits + "_-")
    if any(ch not in allowed for ch in vid_id):
        print("malicious json", file=sys.stderr)
        return

    # os.path.join keeps the paths relative when argv[1] has no directory
    # part; plain "{basepath}/..." would point at the filesystem root
    basepath = os.path.dirname(sys.argv[1])

    thumb_fn = os.path.join(basepath, f"{vid_id}.jpg")
    tmp_fn = f"{thumb_fn}.{os.getpid()}"
    thumb_hosts = ("https://img.youtube.com/vi/", "https://i.ytimg.com/vi/")
    if not os.path.exists(thumb_fn) and thumb_url.startswith(thumb_hosts):
        try:
            # timeout so an unresponsive server cannot hang the script
            with urllib.request.urlopen(thumb_url, timeout=30) as fi:
                with open(tmp_fn, "wb") as fo:
                    fo.write(fi.read())

            # download into a pid-suffixed file first so a partial
            # download never sits under the final name
            os.rename(tmp_fn, thumb_fn)
        except Exception:
            # best-effort: network, http and filesystem errors all just
            # mean "no thumbnail"; clean up the partial file if any
            if os.path.exists(tmp_fn):
                os.unlink(tmp_fn)

    try:
        with open(thumb_fn, "rb") as f:
            thumb = base64.b64encode(f.read()).decode("ascii")
    except OSError:
        # no cached thumbnail; use the embedded placeholder jpeg
        thumb = "/9j/4AAQSkZJRgABAQEASABIAAD/2wBDAAMCAgICAgMCAgIDAwMDBAYEBAQEBAgGBgUGCQgKCgkICQkKDA8MCgsOCwkJDRENDg8QEBEQCgwSExIQEw8QEBD/yQALCAABAAEBAREA/8wABgAQEAX/2gAIAQEAAD8A0s8g/9k="

    metadata["thumbnail"] = "data:image/jpeg;base64," + thumb

    ret = {
        "metadata": metadata,
        "version": "1.5",
        # aware replacement for datetime.utcnow (deprecated since 3.12);
        # the formatted string is the same
        "createTime": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
    }

    # index the formats once instead of rescanning them for every itag;
    # setdefault keeps the first entry when an itag appears more than once
    urls_by_itag = {}
    for afmt in pd["streamingData"]["adaptiveFormats"]:
        urls_by_itag.setdefault(afmt.get("itag"), afmt.get("url"))

    for stream, itags in priority.items():
        for itag in itags:
            url = urls_by_itag.get(itag)
            if url:
                # only the best available itag is recorded per stream
                ret[stream] = {itag: url}
                break

    fn = os.path.join(basepath, f"{vid_id}.urls.json")
    with open(fn, "w", encoding="utf-8", errors="replace") as f:
        f.write(json.dumps(ret, indent=4))


if __name__ == "__main__":
    try:
        main()
    except Exception:
        # deliberately best-effort: on any failure the script stays silent
        # and simply prints no tags. `Exception` rather than a bare
        # `except` so that ctrl-c and sys.exit are not swallowed as well.
        # swap `pass` for `raise` when debugging
        pass