From 79325a21020aa34fa9df836837cdd3bc6c174ade Mon Sep 17 00:00:00 2001
From: Simon Alibert <alibert.sim@gmail.com>
Date: Mon, 24 Jun 2024 15:33:06 +0000
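Subject: [PATCH] Add vcodec arg

Expose a `--vcodec` CLI option (default: libx264, libx265, libsvtav1)
and pass it to `main()`. Video codecs and pixel formats are now swept as
outer loops around the per-dataset `g`/`crf` benchmarks, and one CSV is
written per codec with the codec name in the file name.

Also loosen the `tolerance_s` used when decoding frames in
`process_sample` from 1e-4 to 2e-1.

---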
 benchmark/video/run_video_benchmark.py | 86 +++++++++++++-------------
 1 file changed, 44 insertions(+), 42 deletions(-)

diff --git a/benchmark/video/run_video_benchmark.py b/benchmark/video/run_video_benchmark.py
index bcbf4223c..9b96207ee 100644
--- a/benchmark/video/run_video_benchmark.py
+++ b/benchmark/video/run_video_benchmark.py
@@ -178,7 +178,7 @@ def process_sample(t):
 
         with time_benchmark:
             frames = decode_video_frames_torchvision(
-                video_path, timestamps=timestamps, tolerance_s=1e-4, backend=backend
+                video_path, timestamps=timestamps, tolerance_s=2e-1, backend=backend
             )
         result["load_time_video_ms"] = time_benchmark.result_ms / num_frames
 
@@ -304,7 +304,7 @@ def benchmark_encoding_decoding(
 def main(
     output_dir: Path,
     repo_ids: list[str],
-    # vcodec: list[str],
+    vcodec: list[str],
     pix_fmt: list[str],
     g: list[int],
     crf: list[int],
@@ -317,7 +317,7 @@ def main(
     check_datasets_formats(repo_ids)
     encoding_benchmarks = {
         # "vcodec": vcodec,
-        "pix_fmt": pix_fmt,
+        # "pix_fmt": pix_fmt,
         "g": g,
         "crf": crf,
     }
@@ -325,33 +325,9 @@ def main(
         "timestamps_modes": timestamps_modes,
         "backends": backends,
     }
-    benchmark_table = []
-    for repo_id in tqdm(repo_ids, desc="datasets"):
-        dataset = LeRobotDataset(repo_id)
-        imgs_dir = output_dir / "images" / dataset.repo_id.replace("/", "_")
-        # We only use the first episode
-        save_first_episode(imgs_dir, dataset)
-
-        for key, values in tqdm(encoding_benchmarks.items(), desc="encodings", leave=False):
-            for value in tqdm(values, desc=f"encodings ({key})", leave=False):
-                encoding_cfg = BASE_ENCODING.copy()
-                encoding_cfg[key] = value
-                args_path = Path("_".join(str(value) for value in encoding_cfg.values()))
-                video_path = output_dir / "videos" / args_path / f"{repo_id.replace('/', '_')}.mp4"
-                benchmark_table += benchmark_encoding_decoding(
-                    dataset,
-                    video_path,
-                    imgs_dir,
-                    encoding_cfg,
-                    decoding_benchmarks,
-                    num_samples,
-                    num_workers,
-                    save_frames,
-                )
-
-    columns_order = ["repo_id", "resolution", "num_pixels"]
-    columns_order += list(BASE_ENCODING.keys())
-    columns_order += [
+    headers = ["repo_id", "resolution", "num_pixels"]
+    headers += list(BASE_ENCODING.keys())
+    headers += [
         "video_size_bytes",
         "images_size_bytes",
         "video_images_size_ratio",
@@ -364,10 +340,37 @@ def main(
         "avg_psnr",
         "avg_ssim",
     ]
-    benchmark_df = pd.DataFrame(benchmark_table, columns=columns_order)
-    now = dt.datetime.now()
-    csv_path = output_dir / f"{now:%Y-%m-%d}_{now:%H-%M-%S}_{num_samples}-samples.csv"
-    benchmark_df.to_csv(csv_path, header=True, index=False)
+    for video_codec in tqdm(vcodec, desc="encodings (vcodec)", leave=False):
+        benchmark_table = []
+        for pixel_format in tqdm(pix_fmt, desc="encodings (pix_fmt)", leave=False):
+            for repo_id in tqdm(repo_ids, desc="encodings (datasets)"):
+                dataset = LeRobotDataset(repo_id)
+                imgs_dir = output_dir / "images" / dataset.repo_id.replace("/", "_")
+                # We only use the first episode
+                save_first_episode(imgs_dir, dataset)
+                for key, values in tqdm(encoding_benchmarks.items(), desc="encodings (g, crf)", leave=False):
+                    for value in tqdm(values, desc=f"encodings ({key})", leave=False):
+                        encoding_cfg = BASE_ENCODING.copy()
+                        encoding_cfg["vcodec"] = video_codec
+                        encoding_cfg["pix_fmt"] = pixel_format
+                        encoding_cfg[key] = value
+                        args_path = Path("_".join(str(value) for value in encoding_cfg.values()))
+                        video_path = output_dir / "videos" / args_path / f"{repo_id.replace('/', '_')}.mp4"
+                        benchmark_table += benchmark_encoding_decoding(
+                            dataset,
+                            video_path,
+                            imgs_dir,
+                            encoding_cfg,
+                            decoding_benchmarks,
+                            num_samples,
+                            num_workers,
+                            save_frames,
+                        )
+
+        benchmark_df = pd.DataFrame(benchmark_table, columns=headers)
+        now = dt.datetime.now()
+        csv_path = output_dir / f"{now:%Y-%m-%d}_{now:%H-%M-%S}_{video_codec}_{num_samples}-samples.csv"
+        benchmark_df.to_csv(csv_path, header=True, index=False)
 
 
 if __name__ == "__main__":
@@ -390,14 +393,13 @@ def main(
         ],
         help="Datasets repo-ids to test against. First episodes only are used. Must be images.",
     )
-    # TODO(aliberts): add "libaom-av1" (need to build ffmpeg with "--enable-libaom")
-    # parser.add_argument(
-    #     "--vcodec",
-    #     type=str,
-    #     nargs="*",
-    #     default=["libx264", "libaom-av1"],
-    #     help="Video codecs to be tested",
-    # )
+    parser.add_argument(
+        "--vcodec",
+        type=str,
+        nargs="*",
+        default=["libx264", "libx265", "libsvtav1"],
+        help="Video codecs to be tested",
+    )
     parser.add_argument(
         "--pix-fmt",
         type=str,