From a03f838419bc0847c4922401d22f6b649883e1ff Mon Sep 17 00:00:00 2001
From: Josh Allmann <joshua.allmann@gmail.com>
Date: Thu, 5 Mar 2020 23:30:56 +0000
Subject: [PATCH] ffmpeg: Persist filtergraph in between transcodes.

Re-initializing the filtergraph turns out to be expensive,
especially if GPUs are in the picture. We get around 30% more
performance by persisting the filtergraph.

In order to flush the filter, we cache the most recent audio/video
frame and feed those into the filter repeatedly with a sentinel
value set (`AVFrame.opaque`). Once we receive a frame from the
filtergraph with the sentinel set, we know the filter has been
completely flushed.
---
 ffmpeg/lpms_ffmpeg.c | 73 +++++++++++++++++++++++++++++++-------------
 1 file changed, 52 insertions(+), 21 deletions(-)

diff --git a/ffmpeg/lpms_ffmpeg.c b/ffmpeg/lpms_ffmpeg.c
index 4b730fef4a..cff02c0090 100644
--- a/ffmpeg/lpms_ffmpeg.c
+++ b/ffmpeg/lpms_ffmpeg.c
@@ -9,8 +9,6 @@
 #include <libavutil/opt.h>
 #include <libavutil/pixdesc.h>
 
-#include <pthread.h>
-
 // Not great to appropriate internal API like this...
 const int lpms_ERR_INPUT_PIXFMT = FFERRTAG('I','N','P','X');
 const int lpms_ERR_FILTERS = FFERRTAG('F','L','T','R');
@@ -32,11 +30,12 @@ struct input_ctx {
   enum AVHWDeviceType hw_type;
   char *device;
 
-  int64_t next_pts_a, next_pts_v;
-
   // Decoder flush
   AVPacket *first_pkt;
   int flushed;
+
+  // Filter flush
+  AVFrame *last_frame_v, *last_frame_a;
 };
 
 struct filter_ctx {
@@ -47,6 +46,8 @@ struct filter_ctx {
   AVFilterContext *src_ctx;
 
   uint8_t *hwframes; // GPU frame pool data
+  int64_t flush_offset;
+  int flushed, flush_count;
 };
 
 struct output_ctx {
@@ -214,13 +215,14 @@ static void close_output(struct output_ctx *octx)
   }
   if (octx->vc && AV_HWDEVICE_TYPE_NONE == octx->hw_type) avcodec_free_context(&octx->vc);
   if (octx->ac) avcodec_free_context(&octx->ac);
-  free_filter(&octx->vf);
-  free_filter(&octx->af);
+  octx->af.flushed = octx->vf.flushed = 0;
 }
 
 static void free_output(struct output_ctx *octx) {
   close_output(octx);
   if (octx->vc) avcodec_free_context(&octx->vc);
+  free_filter(&octx->vf);
+  free_filter(&octx->af);
 }
 
 
@@ -736,6 +738,8 @@ static void free_input(struct input_ctx *inctx)
   }
   if (inctx->ac) avcodec_free_context(&inctx->ac);
   if (inctx->hw_device_ctx) av_buffer_unref(&inctx->hw_device_ctx);
+  if (inctx->last_frame_v) av_frame_free(&inctx->last_frame_v);
+  if (inctx->last_frame_a) av_frame_free(&inctx->last_frame_a);
 }
 
 static int open_video_decoder(input_params *params, struct input_ctx *ctx)
@@ -855,6 +859,10 @@ static int open_input(input_params *params, struct input_ctx *ctx)
   if (ret < 0) dd_err("Unable to open video decoder\n")
   ret = open_audio_decoder(params, ctx);
   if (ret < 0) dd_err("Unable to open audio decoder\n")
+  ctx->last_frame_v = av_frame_alloc();
+  if (!ctx->last_frame_v) dd_err("Unable to alloc last_frame_v");
+  ctx->last_frame_a = av_frame_alloc();
+  if (!ctx->last_frame_a) dd_err("Unable to alloc last_frame_a");
 
   return 0;
 
@@ -1025,6 +1033,7 @@ int process_out(struct input_ctx *ictx, struct output_ctx *octx, AVCodecContext
   goto proc_cleanup; \
 }
   int ret = 0;
+  int is_flushing = 0;
 
   if (!encoder) proc_err("Trying to transmux; not supported")
 
@@ -1043,15 +1052,24 @@ int process_out(struct input_ctx *ictx, struct output_ctx *octx, AVCodecContext
     ret = init_video_filters(ictx, octx);
     if (ret < 0) return lpms_ERR_FILTERS;
   }
+  // Start filter flushing process if necessary
+  if (!inf && !filter->flushed) {
+    // Set input frame to the last frame
+    // And increment pts offset by pkt_duration
+    // TODO It may make sense to use the expected output packet duration instead
+    int is_video = AVMEDIA_TYPE_VIDEO == ost->codecpar->codec_type;
+    AVFrame *frame = is_video ? ictx->last_frame_v : ictx->last_frame_a;
+    filter->flush_offset += frame->pkt_duration;
+    inf = frame;
+    inf->opaque = (void*)inf->pts; // value doesn't matter; just needs to be set
+    is_flushing = 1;
+  }
   if (inf) {
+    // Apply the offset from filter flushing, then reset for the next output
+    inf->pts += filter->flush_offset;
     ret = av_buffersrc_write_frame(filter->src_ctx, inf);
+    inf->pts -= filter->flush_offset;
     if (ret < 0) proc_err("Error feeding the filtergraph");
-  } else {
-    // We need to set the pts at EOF to the *end* of the last packet
-    // in order to avoid discarding any queued packets
-    int64_t next_pts = AVMEDIA_TYPE_VIDEO == ost->codecpar->codec_type ?
-      ictx->next_pts_v : ictx->next_pts_a;
-    av_buffersrc_close(filter->src_ctx, next_pts, AV_BUFFERSRC_FLAG_PUSH);
   }
 
   while (1) {
@@ -1066,6 +1084,15 @@ int process_out(struct input_ctx *ictx, struct output_ctx *octx, AVCodecContext
       if (inf) return ret;
       frame = NULL;
     } else if (ret < 0) proc_err("Error consuming the filtergraph\n");
+    if (frame && frame->opaque) {
+      // opaque being set means it's a flush packet
+      filter->flush_count++;
+      // don't set flushed flag in case this is a flush from a previous segment
+      if (is_flushing) filter->flushed = 1;
+      frame->opaque = NULL; // reset just to be sure
+      continue;
+    }
+    if (frame) frame->pts -= filter->flush_count;
     ret = encode(encoder, frame, octx, ost);
     av_frame_unref(frame);
     // For HW we keep the encoder open so will only get EAGAIN.
@@ -1167,8 +1194,6 @@ int transcode(struct transcode_thread *h,
       if (octx->vc) {
         ret = add_video_stream(octx, ictx);
         if (ret < 0) main_err("Unable to re-add video stream\n");
-        ret = init_video_filters(ictx, octx);
-        if (ret < 0) main_err("Unable to re-open video filter\n")
       } else fprintf(stderr, "no video stream\n");
 
       // re-attach audio encoder
@@ -1190,6 +1215,7 @@ int transcode(struct transcode_thread *h,
   while (1) {
     int has_frame = 0;
     AVStream *ist = NULL;
+    AVFrame *last_frame = NULL;
     av_frame_unref(dframe);
     ret = process_in(ictx, dframe, &ipkt);
     if (ret == AVERROR_EOF) break;
@@ -1204,20 +1230,25 @@ int transcode(struct transcode_thread *h,
       // width / height will be zero for pure streamcopy (no decoding)
       decoded_results->frames += dframe->width && dframe->height;
       decoded_results->pixels += dframe->width * dframe->height;
+      has_frame = has_frame && dframe->width && dframe->height;
+      if (has_frame) last_frame = ictx->last_frame_v;
+    } else if (AVMEDIA_TYPE_AUDIO == ist->codecpar->codec_type) {
+      has_frame = has_frame && dframe->nb_samples;
+      if (has_frame) last_frame = ictx->last_frame_a;
+    }
       if (has_frame) {
         int64_t dur = 0;
         if (dframe->pkt_duration) dur = dframe->pkt_duration;
-        else if (ist->avg_frame_rate.den) {
-          dur = av_rescale_q(1, av_inv_q(ist->avg_frame_rate), ist->time_base);
+        else if (ist->r_frame_rate.den) {
+          dur = av_rescale_q(1, av_inv_q(ist->r_frame_rate), ist->time_base);
         } else {
           // TODO use better heuristics for this; look at how ffmpeg does it
-          //fprintf(stderr, "Could not determine next pts; filter might drop\n");
+          fprintf(stderr, "Could not determine next pts; filter might drop\n");
         }
-        ictx->next_pts_v = dframe->pts + dur;
+        dframe->pkt_duration = dur;
+        av_frame_unref(last_frame);
+        av_frame_ref(last_frame, dframe);
       }
-    } else if (AVMEDIA_TYPE_AUDIO == ist->codecpar->codec_type) {
-      if (has_frame) ictx->next_pts_a = dframe->pts + dframe->pkt_duration;
-    }
 
     for (i = 0; i < nb_outputs; i++) {
       struct output_ctx *octx = &outputs[i];