From 1f3e865eb4f43feb97f65f923e4fd3952afb8ea4 Mon Sep 17 00:00:00 2001 From: Yuri Putivsky Date: Mon, 3 Feb 2020 14:56:24 -0800 Subject: [PATCH] Integrated base decoder into VideoReader class and video_utils.py (#1766) Summary: Pull Request resolved: https://github.com/pytorch/vision/pull/1766 Replaced FfmpegDecoder (incompatible with VUE) with the base decoder (compatible with VUE). Modified the Python utilities in video_utils.py for internal simplification. The public interface is preserved. Reviewed By: fmassa Differential Revision: D19415903 fbshipit-source-id: 4d7a0158bd77bac0a18732fe4183fdd9a57f6402 --- setup.py | 32 +- .../csrc/cpu/decoder/audio_sampler.cpp | 40 +- torchvision/csrc/cpu/decoder/audio_sampler.h | 2 - torchvision/csrc/cpu/decoder/audio_stream.cpp | 53 +- torchvision/csrc/cpu/decoder/audio_stream.h | 5 - torchvision/csrc/cpu/decoder/cc_stream.cpp | 2 - torchvision/csrc/cpu/decoder/cc_stream.h | 2 - torchvision/csrc/cpu/decoder/decoder.cpp | 217 ++++--- torchvision/csrc/cpu/decoder/decoder.h | 20 +- torchvision/csrc/cpu/decoder/defs.h | 83 ++- .../csrc/cpu/decoder/memory_buffer.cpp | 75 +++ torchvision/csrc/cpu/decoder/memory_buffer.h | 25 + .../csrc/cpu/decoder/seekable_buffer.cpp | 159 +++-- .../csrc/cpu/decoder/seekable_buffer.h | 29 +- torchvision/csrc/cpu/decoder/stream.cpp | 201 +++++-- torchvision/csrc/cpu/decoder/stream.h | 38 +- .../csrc/cpu/decoder/subtitle_sampler.cpp | 2 - .../csrc/cpu/decoder/subtitle_sampler.h | 2 - .../csrc/cpu/decoder/subtitle_stream.cpp | 31 +- .../csrc/cpu/decoder/subtitle_stream.h | 8 +- torchvision/csrc/cpu/decoder/sync_decoder.cpp | 35 +- torchvision/csrc/cpu/decoder/sync_decoder.h | 7 +- .../csrc/cpu/decoder/sync_decoder_test.cpp | 139 ++++- torchvision/csrc/cpu/decoder/time_keeper.cpp | 12 +- torchvision/csrc/cpu/decoder/time_keeper.h | 8 +- torchvision/csrc/cpu/decoder/util.cpp | 2 - torchvision/csrc/cpu/decoder/util.h | 2 - .../csrc/cpu/decoder/video_sampler.cpp | 2 - torchvision/csrc/cpu/decoder/video_sampler.h | 2 - torchvision/csrc/cpu/decoder/video_stream.cpp | 58 +- torchvision/csrc/cpu/decoder/video_stream.h | 9 +- .../cpu/video_reader/FfmpegAudioSampler.cpp | 118 ---- .../cpu/video_reader/FfmpegAudioSampler.h | 32 - .../cpu/video_reader/FfmpegAudioStream.cpp | 103 ---- .../csrc/cpu/video_reader/FfmpegAudioStream.h | 54 -- .../csrc/cpu/video_reader/FfmpegDecoder.cpp | 412 ------------- .../csrc/cpu/video_reader/FfmpegDecoder.h | 127 ---- .../csrc/cpu/video_reader/FfmpegHeaders.h | 13 - .../csrc/cpu/video_reader/FfmpegSampler.h | 16 - .../csrc/cpu/video_reader/FfmpegStream.cpp | 188 ------ .../csrc/cpu/video_reader/FfmpegStream.h | 69 --- .../csrc/cpu/video_reader/FfmpegUtil.cpp | 111 ---- .../csrc/cpu/video_reader/FfmpegUtil.h | 27 - .../cpu/video_reader/FfmpegVideoSampler.cpp | 90 --- .../cpu/video_reader/FfmpegVideoSampler.h | 32 - .../cpu/video_reader/FfmpegVideoStream.cpp | 115 ---- .../csrc/cpu/video_reader/FfmpegVideoStream.h | 54 -- .../csrc/cpu/video_reader/Interface.cpp | 22 - torchvision/csrc/cpu/video_reader/Interface.h | 127 ---- .../csrc/cpu/video_reader/VideoReader.cpp | 547 ++++++++++++------ .../csrc/cpu/video_reader/VideoReader.h | 96 --- torchvision/csrc/cpu/video_reader/util.cpp | 60 -- torchvision/csrc/cpu/video_reader/util.h | 26 - 53 files changed, 1144 insertions(+), 2597 deletions(-) create mode 100644 torchvision/csrc/cpu/decoder/memory_buffer.cpp create mode 100644 torchvision/csrc/cpu/decoder/memory_buffer.h delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegAudioSampler.cpp delete mode 100644 
torchvision/csrc/cpu/video_reader/FfmpegAudioSampler.h delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegAudioStream.cpp delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegAudioStream.h delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegDecoder.cpp delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegDecoder.h delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegHeaders.h delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegSampler.h delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegStream.cpp delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegStream.h delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegUtil.cpp delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegUtil.h delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegVideoSampler.cpp delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegVideoSampler.h delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegVideoStream.cpp delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegVideoStream.h delete mode 100644 torchvision/csrc/cpu/video_reader/Interface.cpp delete mode 100644 torchvision/csrc/cpu/video_reader/Interface.h delete mode 100644 torchvision/csrc/cpu/video_reader/util.cpp delete mode 100644 torchvision/csrc/cpu/video_reader/util.h diff --git a/setup.py b/setup.py index 4ebf6355e7a..bba3bb6fe45 100644 --- a/setup.py +++ b/setup.py @@ -180,41 +180,21 @@ def get_extensions(): ffmpeg_root = os.path.dirname(ffmpeg_bin) ffmpeg_include_dir = os.path.join(ffmpeg_root, 'include') - # TorchVision video reader + # TorchVision base decoder + video reader video_reader_src_dir = os.path.join(this_dir, 'torchvision', 'csrc', 'cpu', 'video_reader') video_reader_src = glob.glob(os.path.join(video_reader_src_dir, "*.cpp")) - - ext_modules.append( - CppExtension( - 'torchvision.video_reader', - video_reader_src, - include_dirs=[ - video_reader_src_dir, - ffmpeg_include_dir, - extensions_dir, - ], - libraries=[ - 'avcodec', - 'avformat', - 'avutil', - 'swresample', - 'swscale', - ], - extra_compile_args=["-std=c++14"], - extra_link_args=["-std=c++14"], - ) - ) - - # TorchVision base decoder base_decoder_src_dir = os.path.join(this_dir, 'torchvision', 'csrc', 'cpu', 'decoder') base_decoder_src = glob.glob(os.path.join(base_decoder_src_dir, "[!sync_decoder_test]*.cpp")) + combined_src = video_reader_src + base_decoder_src + ext_modules.append( CppExtension( - 'torchvision.base_decoder', - base_decoder_src, + 'torchvision.video_reader', + combined_src, include_dirs=[ base_decoder_src_dir, + video_reader_src_dir, ffmpeg_include_dir, extensions_dir, ], diff --git a/torchvision/csrc/cpu/decoder/audio_sampler.cpp b/torchvision/csrc/cpu/decoder/audio_sampler.cpp index c10fceb852d..4092df98359 100644 --- a/torchvision/csrc/cpu/decoder/audio_sampler.cpp +++ b/torchvision/csrc/cpu/decoder/audio_sampler.cpp @@ -1,15 +1,10 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #include "audio_sampler.h" #include #include "util.h" -// www.ffmpeg.org/doxygen/1.1/doc_2examples_2resampling_audio_8c-example.html#a24 - -#ifndef SWR_CH_MAX -#define SWR_CH_MAX 32 -#endif +#define AVRESAMPLE_MAX_CHANNELS 32 +// www.ffmpeg.org/doxygen/1.1/doc_2examples_2resampling_audio_8c-example.html#a24 namespace ffmpeg { namespace { @@ -94,9 +89,12 @@ int AudioSampler::numOutputSamples(int inSamples) const { } int AudioSampler::getSamplesBytes(AVFrame* frame) const { - return av_get_bytes_per_sample((AVSampleFormat)params_.out.audio.format) * - numOutputSamples(frame ? 
frame->nb_samples : 0) * - params_.out.audio.channels; + return av_samples_get_buffer_size( + nullptr, + params_.out.audio.channels, + numOutputSamples(frame ? frame->nb_samples : 0), + (AVSampleFormat)params_.out.audio.format, + 1); } int AudioSampler::sample( @@ -104,7 +102,7 @@ int AudioSampler::sample( int inNumSamples, ByteStorage* out, int outNumSamples) { - uint8_t* outPlanes[SWR_CH_MAX] = {nullptr}; + uint8_t* outPlanes[AVRESAMPLE_MAX_CHANNELS] = {nullptr}; int result; if ((result = preparePlanes( params_.out.audio, out->writableTail(), outNumSamples, outPlanes)) < @@ -140,9 +138,12 @@ int AudioSampler::sample(AVFrame* frame, ByteStorage* out) { return 0; } - const auto samplesBytes = - av_get_bytes_per_sample((AVSampleFormat)params_.out.audio.format) * - outNumSamples * params_.out.audio.channels; + const auto samplesBytes = av_samples_get_buffer_size( + nullptr, + params_.out.audio.channels, + outNumSamples, + (AVSampleFormat)params_.out.audio.format, + 1); // bytes must be allocated CHECK_LE(samplesBytes, out->tail()); @@ -167,14 +168,17 @@ int AudioSampler::sample(const ByteStorage* in, ByteStorage* out) { return 0; } - const auto samplesBytes = - av_get_bytes_per_sample((AVSampleFormat)params_.out.audio.format) * - outNumSamples * params_.out.audio.channels; + const auto samplesBytes = av_samples_get_buffer_size( + nullptr, + params_.out.audio.channels, + outNumSamples, + (AVSampleFormat)params_.out.audio.format, + 1); out->clear(); out->ensure(samplesBytes); - uint8_t* inPlanes[SWR_CH_MAX] = {nullptr}; + uint8_t* inPlanes[AVRESAMPLE_MAX_CHANNELS] = {nullptr}; int result; if (in && (result = preparePlanes( diff --git a/torchvision/csrc/cpu/decoder/audio_sampler.h b/torchvision/csrc/cpu/decoder/audio_sampler.h index d68a21ea20e..c6a021d2084 100644 --- a/torchvision/csrc/cpu/decoder/audio_sampler.h +++ b/torchvision/csrc/cpu/decoder/audio_sampler.h @@ -1,5 +1,3 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #pragma once #include "defs.h" diff --git a/torchvision/csrc/cpu/decoder/audio_stream.cpp b/torchvision/csrc/cpu/decoder/audio_stream.cpp index 17ab9fceb7b..ed4d6622ecd 100644 --- a/torchvision/csrc/cpu/decoder/audio_stream.cpp +++ b/torchvision/csrc/cpu/decoder/audio_stream.cpp @@ -1,5 +1,3 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #include "audio_stream.h" #include #include @@ -8,11 +6,23 @@ namespace ffmpeg { namespace { +bool operator==(const AudioFormat& x, const AVFrame& y) { + return x.samples == y.sample_rate && x.channels == y.channels && + x.format == y.format; +} + bool operator==(const AudioFormat& x, const AVCodecContext& y) { return x.samples == y.sample_rate && x.channels == y.channels && x.format == y.sample_fmt; } +AudioFormat& toAudioFormat(AudioFormat& x, const AVFrame& y) { + x.samples = y.sample_rate; + x.channels = y.channels; + x.format = y.format; + return x; +} + AudioFormat& toAudioFormat(AudioFormat& x, const AVCodecContext& y) { x.samples = y.sample_rate; x.channels = y.channels; @@ -29,7 +39,8 @@ AudioStream::AudioStream( : Stream( inputCtx, MediaFormat::makeMediaFormat(format, index), - convertPtsToWallTime) {} + convertPtsToWallTime, + 0) {} AudioStream::~AudioStream() { if (sampler_) { @@ -65,12 +76,15 @@ int AudioStream::initFormat() { int AudioStream::estimateBytes(bool flush) { ensureSampler(); - if (!(sampler_->getInputFormat().audio == *codecCtx_)) { + // check if input format gets changed + if (flush ? 
!(sampler_->getInputFormat().audio == *codecCtx_) + : !(sampler_->getInputFormat().audio == *frame_)) { // - reinit sampler SamplerParameters params; params.type = format_.type; params.out = format_.format; - toAudioFormat(params.in.audio, *codecCtx_); + flush ? toAudioFormat(params.in.audio, *codecCtx_) + : toAudioFormat(params.in.audio, *frame_); if (flush || !sampler_->init(params)) { return -1; } @@ -84,7 +98,7 @@ int AudioStream::estimateBytes(bool flush) { << ", channels: " << format_.format.audio.channels << ", format: " << format_.format.audio.format; } - return sampler_->getSamplesBytes(frame_); + return sampler_->getSamplesBytes(flush ? nullptr : frame_); } int AudioStream::copyFrameBytes(ByteStorage* out, bool flush) { @@ -92,31 +106,4 @@ int AudioStream::copyFrameBytes(ByteStorage* out, bool flush) { return sampler_->sample(flush ? nullptr : frame_, out); } -void AudioStream::setHeader(DecoderHeader* header) { - header->seqno = numGenerator_++; - - if (codecCtx_->time_base.num != 0) { - header->pts = av_rescale_q( - av_frame_get_best_effort_timestamp(frame_), - codecCtx_->time_base, - AV_TIME_BASE_Q); - } else { - // If the codec time_base is missing then we would've skipped the - // rescalePackage step to rescale to codec time_base, so here we can - // rescale straight from the stream time_base into AV_TIME_BASE_Q. - header->pts = av_rescale_q( - av_frame_get_best_effort_timestamp(frame_), - inputCtx_->streams[format_.stream]->time_base, - AV_TIME_BASE_Q); - } - - if (convertPtsToWallTime_) { - keeper_.adjust(header->pts); - } - - header->keyFrame = 1; - header->fps = std::numeric_limits::quiet_NaN(); - header->format = format_; -} - } // namespace ffmpeg diff --git a/torchvision/csrc/cpu/decoder/audio_stream.h b/torchvision/csrc/cpu/decoder/audio_stream.h index c7708a3356d..4d200114e4a 100644 --- a/torchvision/csrc/cpu/decoder/audio_stream.h +++ b/torchvision/csrc/cpu/decoder/audio_stream.h @@ -1,10 +1,7 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #pragma once #include "audio_sampler.h" #include "stream.h" -#include "time_keeper.h" namespace ffmpeg { @@ -25,13 +22,11 @@ class AudioStream : public Stream { int initFormat() override; int estimateBytes(bool flush) override; int copyFrameBytes(ByteStorage* out, bool flush) override; - void setHeader(DecoderHeader* header) override; void ensureSampler(); private: std::unique_ptr sampler_; - TimeKeeper keeper_; }; } // namespace ffmpeg diff --git a/torchvision/csrc/cpu/decoder/cc_stream.cpp b/torchvision/csrc/cpu/decoder/cc_stream.cpp index 47de485b100..7b443146289 100644 --- a/torchvision/csrc/cpu/decoder/cc_stream.cpp +++ b/torchvision/csrc/cpu/decoder/cc_stream.cpp @@ -1,5 +1,3 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #include "cc_stream.h" namespace ffmpeg { diff --git a/torchvision/csrc/cpu/decoder/cc_stream.h b/torchvision/csrc/cpu/decoder/cc_stream.h index 34506d3259f..d8c98f7be23 100644 --- a/torchvision/csrc/cpu/decoder/cc_stream.h +++ b/torchvision/csrc/cpu/decoder/cc_stream.h @@ -1,5 +1,3 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #pragma once #include "subtitle_stream.h" diff --git a/torchvision/csrc/cpu/decoder/decoder.cpp b/torchvision/csrc/cpu/decoder/decoder.cpp index d8f324863e4..b78c1e47214 100644 --- a/torchvision/csrc/cpu/decoder/decoder.cpp +++ b/torchvision/csrc/cpu/decoder/decoder.cpp @@ -1,5 +1,3 @@ -// Copyright 2004-present Facebook. All Rights Reserved. 
- #include "decoder.h" #include #include @@ -15,9 +13,8 @@ namespace ffmpeg { namespace { -constexpr ssize_t kMinSeekBufferSize = 1024; -constexpr ssize_t kMaxSeekBufferSize = 4 * 1024; -constexpr size_t kIoBufferSize = 4 * 1024; +constexpr size_t kIoBufferSize = 96 * 1024; +constexpr size_t kIoPaddingSize = AV_INPUT_BUFFER_PADDING_SIZE; constexpr size_t kLogBufferSize = 1024; int ffmpeg_lock(void** mutex, enum AVLockOp op) { @@ -205,7 +202,7 @@ void Decoder::initOnce() { av_lockmgr_register(&ffmpeg_lock); av_log_set_callback(Decoder::logFunction); av_log_set_level(AV_LOG_ERROR); - LOG(INFO) << "Registered ffmpeg libs"; + VLOG(1) << "Registered ffmpeg libs"; }); } @@ -213,10 +210,6 @@ Decoder::Decoder() { initOnce(); } -Decoder::~Decoder() { - cleanUp(); -} - bool Decoder::init(const DecoderParameters& params, DecoderInCallback&& in) { cleanUp(); @@ -229,42 +222,28 @@ bool Decoder::init(const DecoderParameters& params, DecoderInCallback&& in) { // set callback and params params_ = params; - auto tmpCtx = avformat_alloc_context(); - - if (!tmpCtx) { + if (!(inputCtx_ = avformat_alloc_context())) { LOG(ERROR) << "Cannot allocate format context"; return false; } AVInputFormat* fmt = nullptr; + int result = 0; if (in) { - const size_t avioCtxBufferSize = kIoBufferSize; - uint8_t* avioCtxBuffer = (uint8_t*)av_malloc(avioCtxBufferSize); - if (!avioCtxBuffer) { - LOG(ERROR) << "av_malloc cannot allocate " << avioCtxBufferSize - << " bytes"; - avformat_close_input(&tmpCtx); - cleanUp(); - return false; - } - - bool canSeek = in(nullptr, 0, 0) == 0; - - if (!seekableBuffer_.init( - std::forward<DecoderInCallback>(in), - kMinSeekBufferSize, - kMaxSeekBufferSize, - params_.timeoutMs)) { - LOG(ERROR) << "seekable buffer initialization failed"; - av_free(avioCtxBuffer); - avformat_close_input(&tmpCtx); + ImageType type = ImageType::UNKNOWN; + if ((result = seekableBuffer_.init( + std::forward<DecoderInCallback>(in), + params_.timeoutMs, + params_.maxSeekableBytes, + params_.isImage ? &type : nullptr)) < 0) { + LOG(ERROR) << "can't initiate seekable buffer"; cleanUp(); return false; } if (params_.isImage) { const char* fmtName = "image2"; - switch (seekableBuffer_.getImageType()) { + switch (type) { case ImageType::JPEG: fmtName = "jpeg_pipe"; break; @@ -281,6 +260,16 @@ bool Decoder::init(const DecoderParameters& params, DecoderInCallback&& in) { fmt = av_find_input_format(fmtName); } + const size_t avioCtxBufferSize = kIoBufferSize; + uint8_t* avioCtxBuffer = + (uint8_t*)av_malloc(avioCtxBufferSize + kIoPaddingSize); + if (!avioCtxBuffer) { + LOG(ERROR) << "av_malloc cannot allocate " << avioCtxBufferSize + << " bytes"; + cleanUp(); + return false; + } + if (!(avioCtx_ = avio_alloc_context( avioCtxBuffer, avioCtxBufferSize, 0, reinterpret_cast<void*>(this), &Decoder::readFunction, nullptr, - canSeek ? &Decoder::seekFunction : nullptr))) { + result == 1 ? 
&Decoder::seekFunction : nullptr))) { LOG(ERROR) << "avio_alloc_context failed"; av_free(avioCtxBuffer); - avformat_close_input(&tmpCtx); cleanUp(); return false; } - tmpCtx->pb = avioCtx_; + inputCtx_->pb = avioCtx_; + inputCtx_->flags |= AVFMT_FLAG_CUSTOM_IO; } - interrupted_ = false; - // ffmpeg avformat_open_input call can hang if media source doesn't respond - // set a guard for handle such situations - std::promise<bool> p; - std::future<bool> f = p.get_future(); - std::thread guard([&f, this]() { - auto timeout = std::chrono::milliseconds(params_.timeoutMs); - if (std::future_status::timeout == f.wait_for(timeout)) { - LOG(ERROR) << "Cannot open stream within " << params_.timeoutMs << " ms"; - interrupted_ = true; - } - }); - - tmpCtx->opaque = reinterpret_cast<void*>(this); - tmpCtx->interrupt_callback.callback = Decoder::shutdownFunction; - tmpCtx->interrupt_callback.opaque = reinterpret_cast<void*>(this); + inputCtx_->opaque = reinterpret_cast<void*>(this); + inputCtx_->interrupt_callback.callback = Decoder::shutdownFunction; + inputCtx_->interrupt_callback.opaque = reinterpret_cast<void*>(this); // add network timeout - tmpCtx->flags |= AVFMT_FLAG_NONBLOCK; + inputCtx_->flags |= AVFMT_FLAG_NONBLOCK; AVDictionary* options = nullptr; av_dict_set_int(&options, "analyzeduration", params_.timeoutMs * 1000, 0); @@ -326,19 +302,38 @@ bool Decoder::init(const DecoderParameters& params, DecoderInCallback&& in) { av_dict_set_int(&options, "listen", 1, 0); } - int result = 0; + interrupted_ = false; + + // ffmpeg avformat_open_input call can hang if media source doesn't respond + // set a guard to handle such situations, if requested + std::promise<bool> p; + std::future<bool> f = p.get_future(); + std::unique_ptr<std::thread> guard; + if (params_.preventStaleness) { + guard = std::make_unique<std::thread>([&f, this]() { + auto timeout = std::chrono::milliseconds(params_.timeoutMs); + if (std::future_status::timeout == f.wait_for(timeout)) { + LOG(ERROR) << "Cannot open stream within " << params_.timeoutMs + << " ms"; + interrupted_ = true; + } + }); + } + if (fmt) { - result = avformat_open_input(&tmpCtx, nullptr, fmt, &options); + result = avformat_open_input(&inputCtx_, nullptr, fmt, &options); } else { result = - avformat_open_input(&tmpCtx, params_.uri.c_str(), nullptr, &options); + avformat_open_input(&inputCtx_, params_.uri.c_str(), nullptr, &options); } - av_dict_free(&options); - p.set_value(true); - guard.join(); + av_dict_free(&options); - inputCtx_ = tmpCtx; + if (guard) { + p.set_value(true); + guard->join(); + guard.reset(); + } if (result < 0 || interrupted_) { LOG(ERROR) << "avformat_open_input failed, error: " @@ -356,7 +351,7 @@ bool Decoder::init(const DecoderParameters& params, DecoderInCallback&& in) { return false; } - if (!activateStreams()) { + if (!openStreams()) { LOG(ERROR) << "Cannot activate streams"; cleanUp(); return false; @@ -364,20 +359,19 @@ bool Decoder::init(const DecoderParameters& params, DecoderInCallback&& in) { onInit(); - if (params.startOffsetMs != 0) { - av_seek_frame( - inputCtx_, - -1, - params.startOffsetMs * AV_TIME_BASE / 1000, - AVSEEK_FLAG_FRAME | AVSEEK_FLAG_ANY); + if (params.startOffset != 0) { + auto offset = params.startOffset <= params.seekAccuracy + ? 
0 : params.startOffset - params.seekAccuracy; + + av_seek_frame(inputCtx_, -1, offset, AVSEEK_FLAG_BACKWARD); } - LOG(INFO) << "Decoder initialized, log level: " << params_.logLevel; - outOfRange_ = false; + VLOG(1) << "Decoder initialized, log level: " << params_.logLevel; return true; } -bool Decoder::activateStreams() { +bool Decoder::openStreams() { for (int i = 0; i < inputCtx_->nb_streams; i++) { // - find the corresponding format at params_.formats set MediaFormat format; @@ -418,6 +412,7 @@ return false; } streams_.emplace(i, std::move(stream)); + inRange_.set(i, true); } } @@ -458,8 +453,8 @@ void Decoder::cleanUp() { seekableBuffer_.shutdown(); } -int Decoder::getBytes(size_t workingTimeInMs) { - if (outOfRange_) { +int Decoder::getFrame(size_t workingTimeInMs) { + if (inRange_.none()) { return ENODATA; } // decode frames until cache is full and leave thread @@ -478,14 +473,16 @@ return std::chrono::steady_clock::now() <= end; }; - int result = ETIMEDOUT; + int result = 0; size_t decodingErrors = 0; - while (!interrupted_ && watcher()) { + bool decodedFrame = false; + while (!interrupted_ && inRange_.any() && !decodedFrame && watcher()) { result = av_read_frame(inputCtx_, &avPacket); if (result == AVERROR(EAGAIN)) { VLOG(4) << "Decoder is busy..."; + std::this_thread::yield(); result = 0; // reset error, EAGAIN is not an error at all - break; + continue; } else if (result == AVERROR_EOF) { flushStreams(); VLOG(1) << "End of stream"; @@ -499,24 +496,24 @@ // get stream auto stream = findByIndex(avPacket.stream_index); - if (stream == nullptr) { + if (stream == nullptr || !inRange_.test(stream->getIndex())) { av_packet_unref(&avPacket); continue; } - stream->rescalePackage(&avPacket); - - AVPacket copyPacket = avPacket; - size_t numConsecutiveNoBytes = 0; // it can be only partial decoding of the package bytes do { // decode package - if ((result = processPacket(stream, &copyPacket)) < 0) { + bool gotFrame = false; + bool hasMsg = false; + // packet either got consumed completely or not at all + if ((result = processPacket(stream, &avPacket, &gotFrame, &hasMsg)) < 0) { + LOG(ERROR) << "processPacket failed with code: " << result; break; } - if (result == 0 && params_.maxProcessNoBytes != 0 && + if (!gotFrame && params_.maxProcessNoBytes != 0 && ++numConsecutiveNoBytes > params_.maxProcessNoBytes) { LOG(ERROR) << "Exceeding max amount of consecutive no bytes"; break; @@ ... numConsecutiveNoBytes = 0; } - copyPacket.size -= result; - copyPacket.data += result; - } while (copyPacket.size > 0); + decodedFrame |= hasMsg; + } while (result == 0); // post loop check if (result < 0) { if (params_.maxPackageErrors != 0 && // check errors ++decodingErrors >= params_.maxPackageErrors) { // reached the limit + LOG(ERROR) << "Exceeding max amount of consecutive package errors"; break; } } else { @@ -546,7 +543,27 @@ av_packet_unref(&avPacket); - return result; + VLOG(2) << "Interrupted loop" + << ", interrupted_ " << interrupted_ << ", inRange_.any() " + << inRange_.any() << ", decodedFrame " << decodedFrame << ", result " + << result; + + // loop can be terminated, either by: + // 1. explicitly interrupted + // 2. terminated by workable timeout + // 3. unrecoverable error or ENODATA (end of stream) + // 4. 
decoded frames' pts are out of the specified range + // 5. successfully decoded frame + if (interrupted_) { + return EINTR; + } + if (result != 0) { + return result; + } + if (inRange_.none()) { + return ENODATA; + } + return 0; } Stream* Decoder::findByIndex(int streamIndex) const { @@ -563,17 +580,23 @@ Stream* Decoder::findByType(const MediaFormat& format) const { return nullptr; } -int Decoder::processPacket(Stream* stream, AVPacket* packet) { +int Decoder::processPacket(Stream* stream, + AVPacket* packet, + bool* gotFrame, + bool* hasMsg) { // decode package - int gotFrame = 0; int result; DecoderOutputMessage msg; msg.payload = createByteStorage(0); - if ((result = stream->decodeFrame(packet, &gotFrame)) >= 0 && gotFrame && - stream->getFrameBytes(&msg, params_.headerOnly) > 0) { + *hasMsg = false; + if ((result = stream->decodePacket( + packet, &msg, params_.headerOnly, gotFrame)) >= 0 && *gotFrame) { // check end offset - if (params_.endOffsetMs <= 0 || - !(outOfRange_ = msg.header.pts > params_.endOffsetMs * 1000)) { + bool endInRange = + params_.endOffset <= 0 || msg.header.pts <= params_.endOffset; + inRange_.set(stream->getIndex(), endInRange); + if (endInRange && msg.header.pts >= params_.startOffset) { + *hasMsg = true; push(std::move(msg)); } } @@ -587,9 +610,13 @@ void Decoder::flushStreams() { while (msg.payload = createByteStorage(0), stream.second->flush(&msg, params_.headerOnly) > 0) { // check end offset - if (params_.endOffsetMs <= 0 || - !(outOfRange_ = msg.header.pts > params_.endOffsetMs * 1000)) { + bool endInRange = + params_.endOffset <= 0 || msg.header.pts <= params_.endOffset; + inRange_.set(stream.second->getIndex(), endInRange); + if (endInRange && msg.header.pts >= params_.startOffset) { push(std::move(msg)); + } else { + msg.payload.reset(); } } } diff --git a/torchvision/csrc/cpu/decoder/decoder.h b/torchvision/csrc/cpu/decoder/decoder.h index 971eec10aa4..11894fabb74 100644 --- a/torchvision/csrc/cpu/decoder/decoder.h +++ b/torchvision/csrc/cpu/decoder/decoder.h @@ -1,7 +1,7 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #pragma once +#include <bitset> +#include <unordered_map> #include "seekable_buffer.h" #include "stream.h" @@ -15,7 +15,6 @@ namespace ffmpeg { class Decoder : public MediaDecoder { public: Decoder(); - ~Decoder() override; // MediaDecoder overrides bool init(const DecoderParameters& params, DecoderInCallback&& in) override; @@ -25,9 +24,10 @@ protected: // function does actual work, derived class calls it in working thread - // periodically. On success method returns 0, ENOADATA on EOF and error on + // periodically. On success method returns 0, ENODATA on EOF, ETIMEDOUT if + // no frames got decoded in the specified timeout time, and error on // unrecoverable error. 
- int getBytes(size_t workingTimeInMs = 100); + int getFrame(size_t workingTimeInMs = 100); // Derived class must override method and consume the provided message virtual void push(DecoderOutputMessage&& buffer) = 0; @@ -56,13 +56,15 @@ virtual int64_t seekCallback(int64_t offset, int whence); virtual int shutdownCallback(); - bool activateStreams(); + bool openStreams(); Stream* findByIndex(int streamIndex) const; Stream* findByType(const MediaFormat& format) const; - int processPacket(Stream* stream, AVPacket* packet); + int processPacket(Stream* stream, + AVPacket* packet, + bool* gotFrame, + bool* hasMsg); void flushStreams(); void cleanUp(); - private: DecoderParameters params_; SeekableBuffer seekableBuffer_; @@ -72,6 +74,6 @@ AVFormatContext* inputCtx_{nullptr}; AVIOContext* avioCtx_{nullptr}; std::unordered_map<int, std::unique_ptr<Stream>> streams_; - bool outOfRange_{false}; + std::bitset<64> inRange_; }; } // namespace ffmpeg diff --git a/torchvision/csrc/cpu/decoder/defs.h b/torchvision/csrc/cpu/decoder/defs.h index 62854668b90..2e282bb59c6 100644 --- a/torchvision/csrc/cpu/decoder/defs.h +++ b/torchvision/csrc/cpu/decoder/defs.h @@ -27,7 +27,7 @@ struct AudioFormat { size_t samples{0}; // number of samples per second (frequency) size_t channels{0}; // number of channels - ssize_t format{-1}; // AVSampleFormat, auto AV_SAMPLE_FMT_NONE + long format{-1}; // AVSampleFormat, auto AV_SAMPLE_FMT_NONE size_t padding[2]; // -- alignment 40 bytes }; @@ -42,7 +42,7 @@ struct VideoFormat { size_t width{0}; // width in pixels size_t height{0}; // height in pixels - ssize_t format{-1}; // AVPixelFormat, auto AV_PIX_FMT_NONE + long format{-1}; // AVPixelFormat, auto AV_PIX_FMT_NONE size_t minDimension{0}; // choose min dimension and rescale accordingly size_t cropImage{0}; // request image crop // -- alignment 40 bytes }; // subtitle/cc struct SubtitleFormat { - ssize_t type{0}; // AVSubtitleType, auto SUBTITLE_NONE + long type{0}; // AVSubtitleType, auto SUBTITLE_NONE size_t padding[4]; // -- alignment 40 bytes }; @@ -94,28 +94,27 @@ struct MediaFormat { } } - explicit MediaFormat(ssize_t s = -1) - : type(TYPE_AUDIO), stream(s), format() {} - explicit MediaFormat(int x, ssize_t s = -1) + explicit MediaFormat(long s = -1) : type(TYPE_AUDIO), stream(s), format() {} + explicit MediaFormat(int x, long s = -1) : type(TYPE_VIDEO), stream(s), format(x) {} - explicit MediaFormat(char x, ssize_t s = -1) + explicit MediaFormat(char x, long s = -1) : type(TYPE_SUBTITLE), stream(s), format(x) {} - explicit MediaFormat(double x, ssize_t s = -1) + explicit MediaFormat(double x, long s = -1) : type(TYPE_CC), stream(s), format(x) {} - static MediaFormat makeMediaFormat(AudioFormat format, ssize_t stream) { + static MediaFormat makeMediaFormat(AudioFormat format, long stream) { MediaFormat result(stream); result.format.audio = format; return result; } - static MediaFormat makeMediaFormat(VideoFormat format, ssize_t stream) { + static MediaFormat makeMediaFormat(VideoFormat format, long stream) { MediaFormat result(0, stream); result.format.video = format; return result; } - static MediaFormat makeMediaFormat(SubtitleFormat format, ssize_t stream) { + static MediaFormat makeMediaFormat(SubtitleFormat format, long stream) { MediaFormat result('0', stream); result.format.subtitle = format; return result; } @@ -126,17 +125,17 @@ // stream index: // set -1 for one stream auto detection, -2 for all streams auto 
detection, // >= 0, specified stream, if caller knows the stream index (unlikely) - ssize_t stream; + long stream; // union keeps one of the possible formats, defined by MediaType FormatUnion format; // output parameters, ignored while initialization // time base numerator - ssize_t num{0}; + long num{0}; // time base denominator - ssize_t den{1}; - // duration of the stream, in stream time base, if available - ssize_t duration{-1}; + long den{1}; + // duration of the stream, in microseconds, if available + long duration{-1}; }; struct DecoderParameters { @@ -146,29 +145,33 @@ // timeout on getting bytes for decoding size_t timeoutMs{1000}; // logging level, default AV_LOG_PANIC - ssize_t logLevel{0}; + long logLevel{0}; // when decoder would give up, 0 means never size_t maxPackageErrors{0}; // max allowed consecutive times no bytes are processed. 0 means infinite. size_t maxProcessNoBytes{0}; - // start offset - ssize_t startOffsetMs{0}; - // end offset - ssize_t endOffsetMs{-1}; + // start offset (us) + long startOffset{0}; + // end offset (us) + long endOffset{-1}; // logging id int64_t loggingUuid{0}; + // internal max seekable buffer size + size_t maxSeekableBytes{0}; // adjust header pts to the epoch time bool convertPtsToWallTime{false}; // indicate if input stream is an encoded image bool isImage{false}; - // what media types should be processed, default none - std::set<MediaFormat> formats; // listen and wait for new rtmp stream bool listen{false}; // don't copy frame body, only header bool headerOnly{false}; - // seek tolerated accuracy - double seekAccuracySec{1.0}; + // interrupt init method on timeout + bool preventStaleness{true}; + // seek tolerated accuracy (us) + double seekAccuracy{1000000.0}; + // what media types should be processed, default none + std::set<MediaFormat> formats; }; struct DecoderHeader { @@ sequence number of each frame size_t seqno{0}; // decoded timestamp in microseconds from either beginning of the stream or // from epoch time, see DecoderParameters::convertPtsToWallTime - ssize_t pts{0}; + long pts{0}; // decoded key frame size_t keyFrame{0}; // frames per second, valid only for video streams @@ -219,27 +222,21 @@ struct DecoderOutputMessage { * Normally input/output parameter @out is set to a valid, not null buffer pointer, * which indicates "read" call, however there are "seek" modes as well. - * @out != nullptr, @size != 0, @timeoutMs != 0 => read from the current offset - * @size bytes => return number bytes read, 0 if no more bytes available, < 0 - * on error. - * @out == nullptr, @size == 0, @timeoutMs == 0 => does provider support "seek" - * capability in a first place? return 0 on success, < 0 if "seek" mode is not - * supported. - * @out == nullptr, @size > 0 => seek the absolute offset == @size, return - * 0 on success and < 0 on error. + * @out != nullptr => read from the current offset, @whence is ignored, + * @size bytes to read => return number of bytes read, 0 if no more bytes + * available, < 0 on error. - * @out == nullptr, @size < 0 => seek the end of the media, return 0 on success - * and < 0 on failure. Provider might support seek doesn't know the media size. + * @out == nullptr, @timeoutMs == 0 => does provider support "seek" + * capability in the first place? @size & @whence are ignored, return 0 on + * success, < 0 if "seek" mode is not supported. - * Additionally if @out is set to null AND @size is set to zero AND - * @timeoutMs is set to zero, caller requests the seek capability of the - * provider, i.e. 
returns 0 on success and error if provider is not supporting - * seek. + * @out == nullptr, @timeoutMs != 0 => normal seek call + * offset == @size, i.e. @whence = [SEEK_SET, SEEK_CUR, SEEK_END, AVSEEK_SIZE] + * return < 0 on error, position if @whence = [SEEK_SET, SEEK_CUR, SEEK_END], + * length of buffer if @whence = [AVSEEK_SIZE]. */ using DecoderInCallback = - std::function<int(uint8_t*, int, uint64_t)>; + std::function<int(uint8_t*, int, int, uint64_t)>; using DecoderOutCallback = std::function<void(DecoderOutputMessage&&)>; diff --git a/torchvision/csrc/cpu/decoder/memory_buffer.cpp b/torchvision/csrc/cpu/decoder/memory_buffer.cpp new file mode 100644 index 00000000000..d91213fdcbb --- /dev/null +++ b/torchvision/csrc/cpu/decoder/memory_buffer.cpp @@ -0,0 +1,75 @@ +#include "memory_buffer.h" +#include + +extern "C" { +#include +} + +namespace ffmpeg { + +MemoryBuffer::MemoryBuffer(const uint8_t* buffer, size_t size) + : buffer_(buffer), len_(size) {} + +int MemoryBuffer::read(uint8_t* buf, int size) { + if (pos_ < len_) { + auto available = std::min(int(len_ - pos_), size); + memcpy(buf, buffer_ + pos_, available); + pos_ += available; + return available; + } + + return 0; +} + +int64_t MemoryBuffer::seek(int64_t offset, int whence) { + if (whence & AVSEEK_SIZE) { + return len_; + } + + // remove force flag + whence &= ~AVSEEK_FORCE; + + switch (whence) { + case SEEK_SET: + if (offset >= 0 && offset <= len_) { + pos_ = offset; + } + break; + case SEEK_END: + if (len_ + offset >= 0 && len_ + offset <= len_) { + pos_ = len_ + offset; + } + break; + case SEEK_CUR: + if (pos_ + offset > 0 && pos_ + offset <= len_) { + pos_ += offset; + } + break; + default: + LOG(ERROR) << "Unknown whence flag provided: " << whence; + } + return pos_; } + +/* static */ +DecoderInCallback MemoryBuffer::getCallback( + const uint8_t* buffer, + size_t size) { + MemoryBuffer object(buffer, size); + return + [object](uint8_t* out, int size, int whence, uint64_t timeoutMs) mutable + -> int { + if (out) { // see defs.h file + // read mode + return object.read(out, size); + } + // seek mode + if (!timeoutMs) { + // seek capability, yes - supported + return 0; + } + return object.seek(size, whence); + }; } + +} // namespace ffmpeg diff --git a/torchvision/csrc/cpu/decoder/memory_buffer.h b/torchvision/csrc/cpu/decoder/memory_buffer.h new file mode 100644 index 00000000000..909626d3cae --- /dev/null +++ b/torchvision/csrc/cpu/decoder/memory_buffer.h @@ -0,0 +1,25 @@ +#pragma once + +#include "defs.h" + +namespace ffmpeg { + +/** + * Class uses external memory buffer and implements a seekable interface. + */ +class MemoryBuffer { + public: + explicit MemoryBuffer(const uint8_t* buffer, size_t size); + int64_t seek(int64_t offset, int whence); + int read(uint8_t* buf, int size); + + // static constructor for decoder callback. + static DecoderInCallback getCallback(const uint8_t* buffer, size_t size); + + private: + const uint8_t* buffer_; // set at construction time + long pos_{0}; // current position + long len_{0}; // bytes in buffer +}; + +} // namespace ffmpeg diff --git a/torchvision/csrc/cpu/decoder/seekable_buffer.cpp b/torchvision/csrc/cpu/decoder/seekable_buffer.cpp index 8d159b789bf..2e6732a2f50 100644 --- a/torchvision/csrc/cpu/decoder/seekable_buffer.cpp +++ b/torchvision/csrc/cpu/decoder/seekable_buffer.cpp @@ -1,8 +1,7 @@ -// Copyright 2004-present Facebook. All Rights Reserved. 
- #include "seekable_buffer.h" #include #include +#include "memory_buffer.h" extern "C" { #include } namespace ffmpeg { -bool SeekableBuffer::init( +int SeekableBuffer::init( DecoderInCallback&& in, - ssize_t minSize, - ssize_t maxSize, - uint64_t timeoutMs) { + uint64_t timeoutMs, + size_t maxSeekableBytes, + ImageType* type) { + shutdown(); + isSeekable_ = in(nullptr, 0, 0, 0) == 0; + if (isSeekable_) { // seekable + if (type) { + if (!readBytes(in, 8, timeoutMs)) { + return -1; + } + setImageType(type); + end_ = 0; + eof_ = false; + std::vector<uint8_t>().swap(buffer_); + // reset callback + if (in(nullptr, 0, SEEK_SET, timeoutMs)) { + return -1; + } + } + inCallback_ = std::forward<DecoderInCallback>(in); + return 1; + } + + if (!readBytes(in, maxSeekableBytes, timeoutMs)) { + return -1; + } + + if (type) { + setImageType(type); + } + + if (eof_) { + end_ = 0; + eof_ = false; + // reuse MemoryBuffer functionality + inCallback_ = MemoryBuffer::getCallback(buffer_.data(), buffer_.size()); + isSeekable_ = true; + return 1; + } inCallback_ = std::forward<DecoderInCallback>(in); - len_ = minSize; - buffer_.resize(len_); - pos_ = 0; + return 0; +} + +bool SeekableBuffer::readBytes( + DecoderInCallback& in, + size_t maxBytes, + uint64_t timeoutMs) { + // Resize to the minimum 4K page or less + buffer_.resize(std::min(maxBytes, 4 * 1024UL)); end_ = 0; - eof_ = 0; + eof_ = false; auto end = std::chrono::steady_clock::now() + std::chrono::milliseconds(timeoutMs); @@ -28,62 +69,58 @@ return std::chrono::steady_clock::now() <= end; }; - bool hasTime = false; - while (!eof_ && end_ < maxSize && (hasTime = watcher())) { + bool hasTime = true; + while (!eof_ && end_ < maxBytes && (hasTime = watcher())) { // let's read all bytes into available buffer - auto res = inCallback_(buffer_.data() + end_, len_ - end_, timeoutMs); + auto res = in(buffer_.data() + end_, buffer_.size() - end_, 0, timeoutMs); if (res > 0) { end_ += res; - if (end_ == len_) { - len_ = std::min(len_ * 4, maxSize); - buffer_.resize(len_); + if (end_ == buffer_.size()) { + buffer_.resize(std::min(end_ * 4UL, maxBytes)); } } else if (res == 0) { - eof_ = 1; + eof_ = true; } else { // error return false; } } - if (!hasTime) { - return false; - } + return hasTime; +} +void SeekableBuffer::setImageType(ImageType* type) { if (buffer_.size() > 2 && buffer_[0] == 0xFF && buffer_[1] == 0xD8 && buffer_[2] == 0xFF) { - imageType_ = ImageType::JPEG; + *type = ImageType::JPEG; } else if ( buffer_.size() > 3 && buffer_[1] == 'P' && buffer_[2] == 'N' && buffer_[3] == 'G') { - imageType_ = ImageType::PNG; + *type = ImageType::PNG; } else if ( buffer_.size() > 1 && ((buffer_[0] == 0x49 && buffer_[1] == 0x49) || (buffer_[0] == 0x4D && buffer_[1] == 0x4D))) { - imageType_ = ImageType::TIFF; + *type = ImageType::TIFF; + } else { + *type = ImageType::UNKNOWN; } - - return true; } int SeekableBuffer::read(uint8_t* buf, int size, uint64_t timeoutMs) { - // 1. pos_ < end_ + if (isSeekable_) { + return inCallback_(buf, size, 0, timeoutMs); + } if (pos_ < end_) { + // read cached bytes for non-seekable callback auto available = std::min(int(end_ - pos_), size); memcpy(buf, buffer_.data() + pos_, available); pos_ += available; return available; } else if (!eof_) { - auto res = inCallback_(buf, size, timeoutMs); // read through - if (res > 0) { - pos_ += res; - if (pos_ > end_ && !buffer_.empty()) { - std::vector<uint8_t>().swap(buffer_); - } - } else if (res == 0) { - eof_ = 1; - } + // normal sequential read (see defs.h file), i.e. 
@buf != null + auto res = inCallback_(buf, size, 0, timeoutMs); // read through + eof_ = res == 0; return res; } else { return 0; } } int64_t SeekableBuffer::seek(int64_t offset, int whence, uint64_t timeoutMs) { - // remove force flag - whence &= ~AVSEEK_FORCE; - // get size request - int size = whence & AVSEEK_SIZE; - // remove size flag - whence &= ~AVSEEK_SIZE; - - if (size) { - return eof_ ? end_ : AVERROR(EINVAL); - } else { - switch (whence) { - case SEEK_SET: - if (offset < 0) { - return AVERROR(EINVAL); - } - if (offset <= end_) { - pos_ = offset; - return pos_; - } - if (!inCallback_(0, offset, timeoutMs)) { - pos_ = offset; - return 0; - } - break; - case SEEK_END: - if (eof_ && pos_ <= end_ && offset < 0 && end_ + offset >= 0) { - pos_ = end_ + offset; - return 0; - } - break; - case SEEK_CUR: - if (pos_ + offset < 0) { - return AVERROR(EINVAL); - } - if (pos_ + offset <= end_) { - pos_ += offset; - return 0; - } - if (!inCallback_(0, pos_ + offset, timeoutMs)) { - pos_ += offset; - return 0; - } - break; - default: - LOG(ERROR) << "Unknown whence flag gets provided: " << whence; - } - } - return AVERROR(EINVAL); // we have no idea what the media size is + return inCallback_(nullptr, offset, whence, timeoutMs); } void SeekableBuffer::shutdown() { + pos_ = end_ = 0; + eof_ = false; + std::vector<uint8_t>().swap(buffer_); inCallback_ = nullptr; } diff --git a/torchvision/csrc/cpu/decoder/seekable_buffer.h b/torchvision/csrc/cpu/decoder/seekable_buffer.h index e8ba327e4ea..9d5729f5306 100644 --- a/torchvision/csrc/cpu/decoder/seekable_buffer.h +++ b/torchvision/csrc/cpu/decoder/seekable_buffer.h @@ -1,5 +1,3 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #pragma once #include "defs.h" @@ -20,27 +18,28 @@ enum class ImageType { class SeekableBuffer { public: - // try to fill out buffer, returns true if EOF detected (seek will supported) - bool init( + // @type is optional, not nullptr only if image detection is required + // \returns 1 if buffer is seekable, 0 if not seekable, < 0 on error + int init( DecoderInCallback&& in, - ssize_t minSize, - ssize_t maxSize, - uint64_t timeoutMs); + uint64_t timeoutMs, + size_t maxSeekableBytes, + ImageType* type); int read(uint8_t* buf, int size, uint64_t timeoutMs); int64_t seek(int64_t offset, int whence, uint64_t timeoutMs); void shutdown(); - ImageType getImageType() const { - return imageType_; - } + + private: + bool readBytes(DecoderInCallback& in, size_t maxBytes, uint64_t timeoutMs); + void setImageType(ImageType* type); private: DecoderInCallback inCallback_; std::vector<uint8_t> buffer_; // resized at init time - ssize_t len_{0}; // current buffer size - ssize_t pos_{0}; // current position (SEEK_CUR iff pos_ < end_) - ssize_t end_{0}; // bytes in buffer [0, buffer_.size()] - ssize_t eof_{0}; // indicates the EOF - ImageType imageType_{ImageType::UNKNOWN}; + long pos_{0}; // current position (SEEK_CUR iff pos_ < end_) + long end_{0}; // current buffer size + bool eof_{0}; // indicates the EOF + bool isSeekable_{false}; // is callback seekable }; } // namespace ffmpeg diff --git a/torchvision/csrc/cpu/decoder/stream.cpp b/torchvision/csrc/cpu/decoder/stream.cpp index 767136657b6..ce13ca05a83 100644 --- a/torchvision/csrc/cpu/decoder/stream.cpp +++ b/torchvision/csrc/cpu/decoder/stream.cpp @@ -1,22 +1,18 @@ -// Copyright 2004-present Facebook. All Rights Reserved. 
- #include "stream.h" #include #include "util.h" namespace ffmpeg { -namespace { -const size_t kDecoderHeaderSize = sizeof(DecoderHeader); -} - Stream::Stream( AVFormatContext* inputCtx, MediaFormat format, - bool convertPtsToWallTime) + bool convertPtsToWallTime, + int64_t loggingUuid) : inputCtx_(inputCtx), format_(format), - convertPtsToWallTime_(convertPtsToWallTime) {} + convertPtsToWallTime_(convertPtsToWallTime), + loggingUuid_(loggingUuid) {} Stream::~Stream() { if (frame_) { @@ -36,25 +32,30 @@ int Stream::openCodec() { auto codec_id = steam->codecpar->codec_id; AVCodec* codec = avcodec_find_decoder(codec_id); if (!codec) { - LOG(ERROR) << "avcodec_find_decoder failed for codec_id: " << int(codec_id); + LOG(ERROR) << "LoggingUuid #" << loggingUuid_ + << ", avcodec_find_decoder failed for codec_id: " + << int(codec_id); return AVERROR(EINVAL); } if (!(codecCtx_ = avcodec_alloc_context3(codec))) { - LOG(ERROR) << "avcodec_alloc_context3 fails"; + LOG(ERROR) << "LoggingUuid #" << loggingUuid_ + << ", avcodec_alloc_context3 failed"; return AVERROR(ENOMEM); } int ret; // Copy codec parameters from input stream to output codec context if ((ret = avcodec_parameters_to_context(codecCtx_, steam->codecpar)) < 0) { - LOG(ERROR) << "Failed to copy codec parameters to decoder context"; + LOG(ERROR) << "LoggingUuid #" << loggingUuid_ + << ", avcodec_parameters_to_context failed"; return ret; } // after avcodec_open2, value of codecCtx_->time_base is NOT meaningful if ((ret = avcodec_open2(codecCtx_, codec, nullptr)) < 0) { - LOG(ERROR) << "avcodec_open2 failed. " << Util::generateErrorDesc(ret); + LOG(ERROR) << "LoggingUuid #" << loggingUuid_ + << ", avcodec_open2 failed: " << Util::generateErrorDesc(ret); avcodec_free_context(&codecCtx_); codecCtx_ = nullptr; return ret; @@ -62,30 +63,41 @@ int Stream::openCodec() { frame_ = av_frame_alloc(); + // always convert to us format_.num = inputCtx_->streams[format_.stream]->time_base.num; format_.den = inputCtx_->streams[format_.stream]->time_base.den; - format_.duration = inputCtx_->streams[format_.stream]->duration; - return initFormat(); -} + switch (format_.type) { + case TYPE_VIDEO: + fps_ = av_q2d(av_guess_frame_rate( + inputCtx_, inputCtx_->streams[format_.stream], nullptr)); + break; + case TYPE_AUDIO: + fps_ = codecCtx_->sample_rate; + break; + default: + fps_ = 30.0; + } + + format_.duration = av_rescale_q( + inputCtx_->streams[format_.stream]->duration, + inputCtx_->streams[format_.stream]->time_base, + AV_TIME_BASE_Q); -// rescale package -void Stream::rescalePackage(AVPacket* packet) { - if (codecCtx_->time_base.num != 0) { - av_packet_rescale_ts( - packet, - inputCtx_->streams[format_.stream]->time_base, - codecCtx_->time_base); + if ((ret = initFormat())) { + LOG(ERROR) << "initFormat failed, type: " << format_.type; } + + return ret; } -int Stream::analyzePacket(const AVPacket* packet, int* gotFramePtr) { +int Stream::analyzePacket(const AVPacket* packet, bool* gotFrame) { int consumed = 0; int result = avcodec_send_packet(codecCtx_, packet); if (result == AVERROR(EAGAIN)) { - *gotFramePtr = 0; // no bytes get consumed, fetch frame + *gotFrame = false; // no bytes get consumed, fetch frame } else if (result == AVERROR_EOF) { - *gotFramePtr = 0; // more than one flush packet + *gotFrame = false; // more than one flush packet if (packet) { // got packet after flush, this is an error return result; @@ -95,23 +107,23 @@ int Stream::analyzePacket(const AVPacket* packet, int* gotFramePtr) { << Util::generateErrorDesc(result); return result; 
// error } else { - consumed = packet ? packet->size : 0; // all bytes get consumed + consumed = 1; // all bytes get consumed } result = avcodec_receive_frame(codecCtx_, frame_); if (result >= 0) { - *gotFramePtr = 1; // frame is available + *gotFrame = true; // frame is available } else if (result == AVERROR(EAGAIN)) { - *gotFramePtr = 0; // no frames at this time, needs more packets + *gotFrame = false; // no frames at this time, needs more packets if (!consumed) { // precaution, if no packages got consumed and no frames are available return result; } } else if (result == AVERROR_EOF) { - *gotFramePtr = 0; // the last frame has been flushed + *gotFrame = false; // the last frame has been flushed // precaution, if no more frames are available assume we consume all bytes - consumed = packet ? packet->size : 0; + consumed = 0; } else { // error LOG(ERROR) << "avcodec_receive_frame failed, err: " << Util::generateErrorDesc(result); @@ ... return consumed; } -int Stream::decodeFrame(const AVPacket* packet, int* gotFramePtr) { - return analyzePacket(packet, gotFramePtr); -} - -int Stream::getFrameBytes(DecoderOutputMessage* out, bool headerOnly) { - return fillBuffer(out, false, headerOnly); +int Stream::decodePacket( + const AVPacket* packet, + DecoderOutputMessage* out, + bool headerOnly, + bool* hasMsg) { + int consumed; + bool gotFrame = false; + *hasMsg = false; + if ((consumed = analyzePacket(packet, &gotFrame)) >= 0 && + (packet == nullptr || gotFrame)) { + int result; + if ((result = getMessage(out, !gotFrame, headerOnly)) < 0) { + return result; // report error + } + *hasMsg = result > 0; + } + return consumed; } int Stream::flush(DecoderOutputMessage* out, bool headerOnly) { - int gotFramePtr = 0; - int result; - if (analyzePacket(nullptr, &gotFramePtr) >= 0 && gotFramePtr && - (result = fillBuffer(out, false, headerOnly)) > 0) { - return result; - } else if ((result = fillBuffer(out, true, headerOnly)) > 0) { + bool hasMsg = false; + int result = decodePacket(nullptr, out, headerOnly, &hasMsg); + if (result < 0) { + avcodec_flush_buffers(codecCtx_); return result; } - return result; + if (!hasMsg) { + avcodec_flush_buffers(codecCtx_); + return 0; + } + return 1; } -int Stream::fillBuffer(DecoderOutputMessage* out, bool flush, bool headerOnly) { - int result = -1; - if (!codecCtx_) { - LOG(INFO) << "Codec is not initialized"; - return result; +int Stream::getMessage(DecoderOutputMessage* out, bool flush, bool headerOnly) { + if (flush) { + // only flush of audio frames makes sense + if (format_.type == TYPE_AUDIO) { + int bytes = 0; + if ((bytes = estimateBytes(true)) < 0) { + return bytes; + } + int processed = 0; + // grab all audio bytes by chunks + do { + out->payload->ensure(out->payload->length() + bytes); + if ((processed = copyFrameBytes(out->payload.get(), true)) < 0) { + return processed; + } + } while (processed); + + if (out->payload->length()) { + // set header first + setHeader(&out->header, flush); + return 1; + } + } + return 0; + } else { + // set header first + setHeader(&out->header, flush); + + if (headerOnly) { + // Only header is requested + return 1; + } + + // decoded frame is available + int bytes; + if ((bytes = estimateBytes(false)) < 0) { + return bytes; + } + out->payload->ensure(bytes); + return copyFrameBytes(out->payload.get(), false); } +} + +void Stream::setHeader(DecoderHeader* header, bool flush) { + header->seqno = numGenerator_++; - // assign message - 
setHeader(&out->header); + setFramePts(header, flush); - if (headerOnly) { - return sizeof(out->header); + if (convertPtsToWallTime_) { + keeper_.adjust(header->pts); } - // init sampler, if any and return required bytes - if ((result = estimateBytes(flush)) < 0) { - return result; + header->format = format_; + header->keyFrame = 0; + header->fps = std::numeric_limits<double>::quiet_NaN(); +} + +void Stream::setFramePts(DecoderHeader* header, bool flush) { + if (flush) { + header->pts = nextPts_; // already in us + } else { + header->pts = av_frame_get_best_effort_timestamp(frame_); + if (header->pts == AV_NOPTS_VALUE) { + header->pts = nextPts_; + } else { + header->pts = av_rescale_q( + header->pts, + inputCtx_->streams[format_.stream]->time_base, + AV_TIME_BASE_Q); + } + + switch (format_.type) { + case TYPE_AUDIO: + nextPts_ = header->pts + frame_->nb_samples * AV_TIME_BASE / fps_; + break; + case TYPE_VIDEO: + nextPts_ = header->pts + AV_TIME_BASE / fps_; + break; + default: + nextPts_ = header->pts; + } } - out->payload->ensure(result); - return copyFrameBytes(out->payload.get(), flush); } } // namespace ffmpeg diff --git a/torchvision/csrc/cpu/decoder/stream.h b/torchvision/csrc/cpu/decoder/stream.h index fd83b90428c..3473a2a0fd3 100644 --- a/torchvision/csrc/cpu/decoder/stream.h +++ b/torchvision/csrc/cpu/decoder/stream.h @@ -1,9 +1,8 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #pragma once #include <atomic> #include "defs.h" +#include "time_keeper.h" extern "C" { #include } @@ -22,23 +21,24 @@ class Stream { public: Stream( AVFormatContext* inputCtx, MediaFormat format, - bool convertPtsToWallTime); + bool convertPtsToWallTime, + int64_t loggingUuid); virtual ~Stream(); // returns 0 - on success or negative error int openCodec(); - // returns number processed bytes from packet, or negative error - int decodeFrame(const AVPacket* packet, int* gotFramePtr); + // returns 1 - if packet got consumed, 0 - if it's not, and < 0 on error + int decodePacket( + const AVPacket* packet, + DecoderOutputMessage* out, + bool headerOnly, + bool* hasMsg); // returns stream index int getIndex() const { return format_.stream; } - // returns number decoded/sampled bytes - int getFrameBytes(DecoderOutputMessage* out, bool headerOnly); - // returns number decoded/sampled bytes + // returns 1 - if message got a payload, 0 - if it's not, and < 0 on error int flush(DecoderOutputMessage* out, bool headerOnly); - // rescale package - void rescalePackage(AVPacket* packet); // return media format MediaFormat getMediaFormat() const { return format_; } protected: virtual int initFormat() = 0; + // returns 1 - if packet got consumed, 0 - if it's not, and < 0 on error + virtual int analyzePacket(const AVPacket* packet, bool* gotFrame); // returns number processed bytes from packet, or negative error - virtual int analyzePacket(const AVPacket* packet, int* gotFramePtr); - // returns number decoded/sampled bytes, or negative error virtual int copyFrameBytes(ByteStorage* out, bool flush) = 0; - // initialize codec, returns output buffer size, or negative error + // estimates bytes in frame, returns output buffer size, or negative error virtual int estimateBytes(bool flush) = 0; // sets output format - virtual void setHeader(DecoderHeader* header) = 0; + virtual void setHeader(DecoderHeader* header, bool flush); + // set frame pts + virtual void setFramePts(DecoderHeader* header, bool flush); // finds codec virtual AVCodec* findCodec(AVCodecContext* ctx); private: - int 
fillBuffer(DecoderOutputMessage* out, bool flush, bool headerOnly); + // returns 1 - if message got a payload, 0 - if it's not, and < 0 on error + int getMessage(DecoderOutputMessage* out, bool flush, bool headerOnly); protected: AVFormatContext* const inputCtx_; MediaFormat format_; const bool convertPtsToWallTime_; + int64_t loggingUuid_; AVCodecContext* codecCtx_{nullptr}; AVFrame* frame_{nullptr}; std::atomic numGenerator_{0}; + TimeKeeper keeper_; + // estimated next frame pts for flushing the last frame + int64_t nextPts_{0}; + double fps_{30.}; }; } // namespace ffmpeg diff --git a/torchvision/csrc/cpu/decoder/subtitle_sampler.cpp b/torchvision/csrc/cpu/decoder/subtitle_sampler.cpp index 02859c19187..b89ef8f1b86 100644 --- a/torchvision/csrc/cpu/decoder/subtitle_sampler.cpp +++ b/torchvision/csrc/cpu/decoder/subtitle_sampler.cpp @@ -1,5 +1,3 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #include "subtitle_sampler.h" #include "util.h" diff --git a/torchvision/csrc/cpu/decoder/subtitle_sampler.h b/torchvision/csrc/cpu/decoder/subtitle_sampler.h index 4846fe4d7c5..298e48d591f 100644 --- a/torchvision/csrc/cpu/decoder/subtitle_sampler.h +++ b/torchvision/csrc/cpu/decoder/subtitle_sampler.h @@ -1,5 +1,3 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #pragma once #include "defs.h" diff --git a/torchvision/csrc/cpu/decoder/subtitle_stream.cpp b/torchvision/csrc/cpu/decoder/subtitle_stream.cpp index b699a0507cf..4f83fad68f8 100644 --- a/torchvision/csrc/cpu/decoder/subtitle_stream.cpp +++ b/torchvision/csrc/cpu/decoder/subtitle_stream.cpp @@ -1,5 +1,3 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #include "subtitle_stream.h" #include #include @@ -26,7 +24,8 @@ SubtitleStream::SubtitleStream( : Stream( inputCtx, MediaFormat::makeMediaFormat(format, index), - convertPtsToWallTime) { + convertPtsToWallTime, + 0) { memset(&sub_, 0, sizeof(sub_)); } @@ -51,16 +50,16 @@ int SubtitleStream::initFormat() { return 0; } -int SubtitleStream::analyzePacket(const AVPacket* packet, int* gotFramePtr) { +int SubtitleStream::analyzePacket(const AVPacket* packet, bool* gotFrame) { // clean-up releaseSubtitle(); // check flush packet AVPacket avPacket; av_init_packet(&avPacket); avPacket.data = nullptr; - auto pkt = packet ? *packet : avPacket; - int result = avcodec_decode_subtitle2(codecCtx_, &sub_, gotFramePtr, &pkt); + int gotFramePtr = 0; + int result = avcodec_decode_subtitle2(codecCtx_, &sub_, &gotFramePtr, &pkt); if (result < 0) { VLOG(1) << "avcodec_decode_subtitle2 failed, err: " @@ -69,17 +68,18 @@ int SubtitleStream::analyzePacket(const AVPacket* packet, int* gotFramePtr) { result = packet ? packet->size : 0; // discard the rest of the package } - sub_.release = *gotFramePtr; + sub_.release = gotFramePtr; + *gotFrame = gotFramePtr > 0; return result; } -int SubtitleStream::estimateBytes(bool flush) { +int SubtitleStream::estimateBytes(bool) { if (!(sampler_.getInputFormat().subtitle == *codecCtx_)) { // - reinit sampler SamplerParameters params; params.type = MediaType::TYPE_SUBTITLE; toSubtitleFormat(params.in.subtitle, *codecCtx_); - if (flush || !sampler_.init(params)) { + if (!sampler_.init(params)) { return -1; } @@ -92,17 +92,8 @@ int SubtitleStream::copyFrameBytes(ByteStorage* out, bool flush) { return sampler_.sample(flush ? 
nullptr : &sub_, out); } -void SubtitleStream::setHeader(DecoderHeader* header) { - header->seqno = numGenerator_++; - +void SubtitleStream::setFramePts(DecoderHeader* header, bool) { header->pts = sub_.pts; // already in us - - if (convertPtsToWallTime_) { - keeper_.adjust(header->pts); - } - - header->keyFrame = 0; - header->fps = std::numeric_limits::quiet_NaN(); - header->format = format_; } + } // namespace ffmpeg diff --git a/torchvision/csrc/cpu/decoder/subtitle_stream.h b/torchvision/csrc/cpu/decoder/subtitle_stream.h index 8669f15e0ce..4297cfa83f7 100644 --- a/torchvision/csrc/cpu/decoder/subtitle_stream.h +++ b/torchvision/csrc/cpu/decoder/subtitle_stream.h @@ -1,10 +1,7 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #pragma once #include "stream.h" #include "subtitle_sampler.h" -#include "time_keeper.h" namespace ffmpeg { @@ -25,18 +22,17 @@ class SubtitleStream : public Stream { ~SubtitleStream() override; protected: - void setHeader(DecoderHeader* header) override; + void setFramePts(DecoderHeader* header, bool flush) override; private: int initFormat() override; - int analyzePacket(const AVPacket* packet, int* gotFramePtr) override; + int analyzePacket(const AVPacket* packet, bool* gotFrame) override; int estimateBytes(bool flush) override; int copyFrameBytes(ByteStorage* out, bool flush) override; void releaseSubtitle(); private: SubtitleSampler sampler_; - TimeKeeper keeper_; AVSubtitleKeeper sub_; }; diff --git a/torchvision/csrc/cpu/decoder/sync_decoder.cpp b/torchvision/csrc/cpu/decoder/sync_decoder.cpp index 6387837218e..5f3c38e08f8 100644 --- a/torchvision/csrc/cpu/decoder/sync_decoder.cpp +++ b/torchvision/csrc/cpu/decoder/sync_decoder.cpp @@ -1,23 +1,26 @@ -// Copyright 2004-present Facebook. All Rights Reserved. 
-
 #include "sync_decoder.h"
 #include

 namespace ffmpeg {

 SyncDecoder::VectorByteStorage::VectorByteStorage(size_t n) {
-  buffer_.resize(n);
+  ensure(n);
+}
+
+SyncDecoder::VectorByteStorage::~VectorByteStorage() {
+  av_free(buffer_);
 }

 void SyncDecoder::VectorByteStorage::ensure(size_t n) {
   if (tail() < n) {
-    buffer_.resize(offset_ + length_ + n);
+    capacity_ = offset_ + length_ + n;
+    buffer_ = static_cast<uint8_t*>(av_realloc(buffer_, capacity_));
   }
 }

 uint8_t* SyncDecoder::VectorByteStorage::writableTail() {
-  CHECK_LE(offset_ + length_, buffer_.size());
-  return buffer_.data() + offset_ + length_;
+  CHECK_LE(offset_ + length_, capacity_);
+  return buffer_ + offset_ + length_;
 }

 void SyncDecoder::VectorByteStorage::append(size_t n) {
@@ -32,7 +35,7 @@ void SyncDecoder::VectorByteStorage::trim(size_t n) {
 }

 const uint8_t* SyncDecoder::VectorByteStorage::data() const {
-  return buffer_.data() + offset_;
+  return buffer_ + offset_;
 }

 size_t SyncDecoder::VectorByteStorage::length() const {
@@ -40,13 +43,11 @@ size_t SyncDecoder::VectorByteStorage::length() const {
 }

 size_t SyncDecoder::VectorByteStorage::tail() const {
-  auto size = buffer_.size();
-  CHECK_LE(offset_ + length_, buffer_.size());
-  return size - offset_ - length_;
+  CHECK_LE(offset_ + length_, capacity_);
+  return capacity_ - offset_ - length_;
 }

 void SyncDecoder::VectorByteStorage::clear() {
-  buffer_.clear();
   offset_ = 0;
   length_ = 0;
 }
@@ -66,16 +67,22 @@ int SyncDecoder::decode(DecoderOutputMessage* out, uint64_t timeoutMs) {
   }

   if (queue_.empty()) {
-    int result = getBytes(timeoutMs);
+    int result = getFrame(timeoutMs);
+    // assign EOF
     eof_ = result == ENODATA;
-
+    // check unrecoverable error, any error but ENODATA
     if (result && result != ENODATA) {
       return result;
     }

     // still empty
     if (queue_.empty()) {
-      return ETIMEDOUT;
+      if (eof_) {
+        return ENODATA;
+      } else {
+        LOG(INFO) << "Queue is empty";
+        return ETIMEDOUT;
+      }
     }
   }
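Not part of the patch: a caller-side sketch of the decode() return convention established above, where 0 delivers a message, ETIMEDOUT means the queue was momentarily empty, ENODATA signals end of stream, and any other code is unrecoverable. The drainDecoder helper and its sink callback are hypothetical; only the return-code handling is taken from the hunk above.

#include <cerrno>
#include <functional>
#include "sync_decoder.h"

using namespace ffmpeg;

// Hypothetical helper: drives SyncDecoder::decode() until end of stream.
void drainDecoder(
    SyncDecoder& decoder,
    const std::function<void(const DecoderOutputMessage&)>& sink) {
  DecoderOutputMessage out;
  int ret;
  while ((ret = decoder.decode(&out, /*timeoutMs=*/100)) != ENODATA) {
    if (ret == 0) {
      sink(out); // a decoded message is ready
    } else if (ret != ETIMEDOUT) {
      break; // unrecoverable error, mirroring the early return above
    }
    // on ETIMEDOUT the queue was merely empty; poll again
  }
  decoder.shutdown();
}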
diff --git a/torchvision/csrc/cpu/decoder/sync_decoder.h b/torchvision/csrc/cpu/decoder/sync_decoder.h
index 76c347fe707..192962acc0c 100644
--- a/torchvision/csrc/cpu/decoder/sync_decoder.h
+++ b/torchvision/csrc/cpu/decoder/sync_decoder.h
@@ -1,5 +1,3 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
 #pragma once

 #include
@@ -13,9 +11,11 @@ namespace ffmpeg {
 * or fetched internally by FFMPEG library
 */
 class SyncDecoder : public Decoder {
+  // Allocation of memory must be done with a proper alignment.
  class VectorByteStorage : public ByteStorage {
   public:
    VectorByteStorage(size_t n);
+    ~VectorByteStorage() override;
    void ensure(size_t n) override;
    uint8_t* writableTail() override;
    void append(size_t n) override;
@@ -28,7 +28,8 @@ class SyncDecoder : public Decoder {
   private:
    size_t offset_{0};
    size_t length_{0};
-    std::vector<uint8_t> buffer_;
+    size_t capacity_{0};
+    uint8_t* buffer_{nullptr};
  };

 public:
diff --git a/torchvision/csrc/cpu/decoder/sync_decoder_test.cpp b/torchvision/csrc/cpu/decoder/sync_decoder_test.cpp
index ee0fe3fcf3c..379c24a0aa0 100644
--- a/torchvision/csrc/cpu/decoder/sync_decoder_test.cpp
+++ b/torchvision/csrc/cpu/decoder/sync_decoder_test.cpp
@@ -1,7 +1,6 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
 #include
 #include
+#include "memory_buffer.h"
 #include "sync_decoder.h"

 using namespace ffmpeg;

@@ -10,7 +9,8 @@ TEST(SyncDecoder, Test) {
   SyncDecoder decoder;
   DecoderParameters params;
   params.timeoutMs = 10000;
-  params.startOffsetMs = 1000;
+  params.startOffset = 1000000;
+  params.seekAccuracy = 100000;
   params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')};
   params.uri = "pytorch/vision/test/assets/videos/R6llTwEh07w.mp4";
   CHECK(decoder.init(params, nullptr));
@@ -20,3 +20,136 @@
   }
   decoder.shutdown();
 }
+
+TEST(SyncDecoder, TestHeadersOnly) {
+  SyncDecoder decoder;
+  DecoderParameters params;
+  params.timeoutMs = 10000;
+  params.startOffset = 1000000;
+  params.seekAccuracy = 100000;
+  params.headerOnly = true;
+  params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')};
+  params.uri = "pytorch/vision/test/assets/videos/R6llTwEh07w.mp4";
+  CHECK(decoder.init(params, nullptr));
+  DecoderOutputMessage out;
+  while (0 == decoder.decode(&out, 100)) {
+    LOG(INFO) << "Decoded frame, type: " << out.header.format.type
+              << ", timestamp(us): " << out.header.pts;
+  }
+  decoder.shutdown();
+}
+
+TEST(SyncDecoder, TestMemoryBuffer) {
+  SyncDecoder decoder;
+  DecoderParameters params;
+  params.timeoutMs = 10000;
+  params.startOffset = 1000000;
+  params.endOffset = 9000000;
+  params.seekAccuracy = 10000;
+  params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')};
+
+  FILE* f = fopen(
+      "pytorch/vision/test/assets/videos/RATRACE_wave_f_nm_np1_fr_goo_37.avi",
+      "rb");
+  CHECK(f != nullptr);
+  fseek(f, 0, SEEK_END);
+  std::vector<uint8_t> buffer(ftell(f));
+  rewind(f);
+  CHECK_EQ(buffer.size(), fread(buffer.data(), 1, buffer.size(), f));
+  fclose(f);
+  CHECK(decoder.init(
+      params, MemoryBuffer::getCallback(buffer.data(), buffer.size())));
+  LOG(INFO) << "Decoding from memory bytes: " << buffer.size();
+  DecoderOutputMessage out;
+  size_t audioFrames = 0, videoFrames = 0;
+  while (0 == decoder.decode(&out, 100)) {
+    if (out.header.format.type == TYPE_AUDIO) {
+      ++audioFrames;
+    } else if (out.header.format.type == TYPE_VIDEO) {
+      ++videoFrames;
+    }
+  }
+  LOG(INFO) << "Decoded audio frames: " << audioFrames
+            << ", video frames: " << videoFrames;
+  decoder.shutdown();
+}
+
+TEST(SyncDecoder, TestMemoryBufferNoSeekableWithFullRead) {
+  SyncDecoder decoder;
+  DecoderParameters params;
+  params.timeoutMs = 10000;
+  params.startOffset = 1000000;
+  params.endOffset = 9000000;
+  params.seekAccuracy = 10000;
+  params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')};
+
+  FILE* f = fopen("pytorch/vision/test/assets/videos/R6llTwEh07w.mp4", "rb");
+  CHECK(f != nullptr);
+  fseek(f, 0, SEEK_END);
+  std::vector<uint8_t> buffer(ftell(f));
+  rewind(f);
+  CHECK_EQ(buffer.size(), fread(buffer.data(), 1, buffer.size(), f));
+  fclose(f);
+
+  params.maxSeekableBytes = buffer.size() + 1;
+  MemoryBuffer object(buffer.data(), buffer.size());
+  CHECK(decoder.init(
+      params,
+      [object](uint8_t* out, int size, int whence, uint64_t timeoutMs) mutable
+      -> int {
+        if (out) { // see defs.h file
+          // read mode
+          return object.read(out, size);
+        }
+        // seek mode
+        if (!timeoutMs) {
+          // seek capability, yes - no
+          return -1;
+        }
+        return object.seek(size, whence);
+      }));
+  DecoderOutputMessage out;
+  while (0 == decoder.decode(&out, 100)) {
+    LOG(INFO) << "Decoded frame, timestamp(us): " << out.header.pts
+              << ", num: " << out.header.format.num
+              << ", den: " << out.header.format.den
+              << ", duration(us): " << out.header.format.duration;
+  }
+  decoder.shutdown();
+}
+
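The memory-buffer tests above and below drive the decoder through a raw read/seek callback; the sketch here factors out the convention their lambdas follow. The callback signature and the read/seek dispatch are taken from the tests; treating a non-negative probe answer as "seek supported" is an assumption inferred from the -1 that the no-seek tests return.

#include <cstdint>
#include "memory_buffer.h"

using namespace ffmpeg;

// Hypothetical factory for the callback the tests write inline. A non-null
// out requests a read; a null out requests a seek; a null out with
// timeoutMs == 0 only probes seek support (see defs.h). With
// seekable == false the probe answers -1, as in the NoSeekable tests, so
// the decoder buffers the input internally, bounded by
// params.maxSeekableBytes.
auto makeMemoryCallback(MemoryBuffer object, bool seekable) {
  return [object, seekable](
             uint8_t* out, int size, int whence, uint64_t timeoutMs) mutable
         -> int {
    if (out) {
      return object.read(out, size); // read mode
    }
    if (!timeoutMs) {
      return seekable ? 0 : -1; // probe: seek capability, yes or no (assumed)
    }
    return object.seek(size, whence); // seek mode
  };
}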
+TEST(SyncDecoder, TestMemoryBufferNoSeekableWithPartialRead) {
+  SyncDecoder decoder;
+  DecoderParameters params;
+  params.timeoutMs = 10000;
+  params.startOffset = 1000000;
+  params.endOffset = 9000000;
+  params.seekAccuracy = 10000;
+  params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')};
+
+  FILE* f = fopen("pytorch/vision/test/assets/videos/R6llTwEh07w.mp4", "rb");
+  CHECK(f != nullptr);
+  fseek(f, 0, SEEK_END);
+  std::vector<uint8_t> buffer(ftell(f));
+  rewind(f);
+  CHECK_EQ(buffer.size(), fread(buffer.data(), 1, buffer.size(), f));
+  fclose(f);
+
+  params.maxSeekableBytes = buffer.size() / 2;
+  MemoryBuffer object(buffer.data(), buffer.size());
+  CHECK(!decoder.init(
+      params,
+      [object](uint8_t* out, int size, int whence, uint64_t timeoutMs) mutable
+      -> int {
+        if (out) { // see defs.h file
+          // read mode
+          return object.read(out, size);
+        }
+        // seek mode
+        if (!timeoutMs) {
+          // seek capability, yes - no
+          return -1;
+        }
+        return object.seek(size, whence);
+      }));
+}
diff --git a/torchvision/csrc/cpu/decoder/time_keeper.cpp b/torchvision/csrc/cpu/decoder/time_keeper.cpp
index a0da56a1f64..9cfc9457963 100644
--- a/torchvision/csrc/cpu/decoder/time_keeper.cpp
+++ b/torchvision/csrc/cpu/decoder/time_keeper.cpp
@@ -1,5 +1,3 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
 #include "time_keeper.h"

 extern "C" {
@@ -9,13 +7,13 @@
 namespace ffmpeg {

 namespace {
-const ssize_t kMaxTimeBaseDiference = 10;
+const long kMaxTimeBaseDiference = 10;
 }

-ssize_t TimeKeeper::adjust(ssize_t& decoderTimestamp) {
-  const ssize_t now = std::chrono::duration_cast<std::chrono::microseconds>(
-                          std::chrono::system_clock::now().time_since_epoch())
-                          .count();
+long TimeKeeper::adjust(long& decoderTimestamp) {
+  const long now = std::chrono::duration_cast<std::chrono::microseconds>(
+                       std::chrono::system_clock::now().time_since_epoch())
+                       .count();

   if (startTime_ == 0) {
     startTime_ = now;
diff --git a/torchvision/csrc/cpu/decoder/time_keeper.h b/torchvision/csrc/cpu/decoder/time_keeper.h
index c9d06025b2c..e4d4718c705 100644
--- a/torchvision/csrc/cpu/decoder/time_keeper.h
+++ b/torchvision/csrc/cpu/decoder/time_keeper.h
@@ -1,5 +1,3 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
 #pragma once

 #include
@@ -17,11 +15,11 @@ class TimeKeeper {

   // adjust provided @timestamp to the corrected value
   // return advised sleep time before next frame processing in (us)
-  ssize_t adjust(ssize_t& decoderTimestamp);
+  long adjust(long& decoderTimestamp);

  private:
-  ssize_t startTime_{0};
-  ssize_t streamTimestamp_{0};
+  long startTime_{0};
+  long streamTimestamp_{0};
 };

 } // namespace ffmpeg
diff --git a/torchvision/csrc/cpu/decoder/util.cpp b/torchvision/csrc/cpu/decoder/util.cpp
index 6ae888838ea..ba19cf582b0 100644
--- a/torchvision/csrc/cpu/decoder/util.cpp
+++ b/torchvision/csrc/cpu/decoder/util.cpp
@@ -1,5 +1,3 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
 #include "util.h"

 #include
diff --git a/torchvision/csrc/cpu/decoder/util.h b/torchvision/csrc/cpu/decoder/util.h
index 6a985d78559..cc64d8944e4 100644
--- a/torchvision/csrc/cpu/decoder/util.h
+++ b/torchvision/csrc/cpu/decoder/util.h
@@ -1,5 +1,3 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
 #pragma once

 #include "defs.h"
diff --git a/torchvision/csrc/cpu/decoder/video_sampler.cpp b/torchvision/csrc/cpu/decoder/video_sampler.cpp
index 1a91c82a371..4b7d078ebd7 100644
--- a/torchvision/csrc/cpu/decoder/video_sampler.cpp
+++ b/torchvision/csrc/cpu/decoder/video_sampler.cpp
@@ -1,5 +1,3 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
- #include "video_sampler.h" #include #include "util.h" diff --git a/torchvision/csrc/cpu/decoder/video_sampler.h b/torchvision/csrc/cpu/decoder/video_sampler.h index 73997c213e1..85161307257 100644 --- a/torchvision/csrc/cpu/decoder/video_sampler.h +++ b/torchvision/csrc/cpu/decoder/video_sampler.h @@ -1,5 +1,3 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #pragma once #include "defs.h" diff --git a/torchvision/csrc/cpu/decoder/video_stream.cpp b/torchvision/csrc/cpu/decoder/video_stream.cpp index 9c6b77d0bfc..e464ed30cc9 100644 --- a/torchvision/csrc/cpu/decoder/video_stream.cpp +++ b/torchvision/csrc/cpu/decoder/video_stream.cpp @@ -1,5 +1,3 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #include "video_stream.h" #include #include "util.h" @@ -11,12 +9,23 @@ bool operator==(const VideoFormat& x, const AVFrame& y) { return x.width == y.width && x.height == y.height && x.format == y.format; } +bool operator==(const VideoFormat& x, const AVCodecContext& y) { + return x.width == y.width && x.height == y.height && x.format == y.pix_fmt; +} + VideoFormat& toVideoFormat(VideoFormat& x, const AVFrame& y) { x.width = y.width; x.height = y.height; x.format = y.format; return x; } + +VideoFormat& toVideoFormat(VideoFormat& x, const AVCodecContext& y) { + x.width = y.width; + x.height = y.height; + x.format = y.pix_fmt; + return x; +} } // namespace VideoStream::VideoStream( @@ -28,8 +37,8 @@ VideoStream::VideoStream( : Stream( inputCtx, MediaFormat::makeMediaFormat(format, index), - convertPtsToWallTime), - loggingUuid_(loggingUuid) {} + convertPtsToWallTime, + loggingUuid) {} VideoStream::~VideoStream() { if (sampler_) { @@ -79,12 +88,14 @@ int VideoStream::initFormat() { int VideoStream::estimateBytes(bool flush) { ensureSampler(); // check if input format gets changed - if (!flush && !(sampler_->getInputFormat().video == *frame_)) { + if (flush ? !(sampler_->getInputFormat().video == *codecCtx_) + : !(sampler_->getInputFormat().video == *frame_)) { // - reinit sampler SamplerParameters params; params.type = format_.type; params.out = format_.format; - toVideoFormat(params.in.video, *frame_); + flush ? toVideoFormat(params.in.video, *codecCtx_) + : toVideoFormat(params.in.video, *frame_); if (!sampler_->init(params)) { return -1; } @@ -108,36 +119,13 @@ int VideoStream::copyFrameBytes(ByteStorage* out, bool flush) { return sampler_->sample(flush ? nullptr : frame_, out); } -void VideoStream::setHeader(DecoderHeader* header) { - header->seqno = numGenerator_++; - - if (codecCtx_->time_base.num != 0) { - header->pts = av_rescale_q( - av_frame_get_best_effort_timestamp(frame_), - codecCtx_->time_base, - AV_TIME_BASE_Q); - } else { - // If the codec time_base is missing then we would've skipped the - // rescalePackage step to rescale to codec time_base, so here we can - // rescale straight from the stream time_base into AV_TIME_BASE_Q. 
- header->pts = av_rescale_q( - av_frame_get_best_effort_timestamp(frame_), - inputCtx_->streams[format_.stream]->time_base, - AV_TIME_BASE_Q); - } - - if (convertPtsToWallTime_) { - keeper_.adjust(header->pts); - } - - header->keyFrame = frame_->key_frame; - auto fpsRational = inputCtx_->streams[format_.stream]->avg_frame_rate; - if (fpsRational.den) { - header->fps = av_q2d(fpsRational); - } else { - header->fps = std::numeric_limits::quiet_NaN(); +void VideoStream::setHeader(DecoderHeader* header, bool flush) { + Stream::setHeader(header, flush); + if (!flush) { // no frames for video flush + header->keyFrame = frame_->key_frame; + header->fps = av_q2d(av_guess_frame_rate( + inputCtx_, inputCtx_->streams[format_.stream], nullptr)); } - header->format = format_; } } // namespace ffmpeg diff --git a/torchvision/csrc/cpu/decoder/video_stream.h b/torchvision/csrc/cpu/decoder/video_stream.h index af1e3fb960f..8e73d099613 100644 --- a/torchvision/csrc/cpu/decoder/video_stream.h +++ b/torchvision/csrc/cpu/decoder/video_stream.h @@ -1,9 +1,6 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #pragma once #include "stream.h" -#include "time_keeper.h" #include "video_sampler.h" namespace ffmpeg { @@ -19,21 +16,19 @@ class VideoStream : public Stream { int index, bool convertPtsToWallTime, const VideoFormat& format, - int64_t loggingUuid = 0); + int64_t loggingUuid); ~VideoStream() override; private: int initFormat() override; int estimateBytes(bool flush) override; int copyFrameBytes(ByteStorage* out, bool flush) override; - void setHeader(DecoderHeader* header) override; + void setHeader(DecoderHeader* header, bool flush) override; void ensureSampler(); private: std::unique_ptr sampler_; - TimeKeeper keeper_; - int64_t loggingUuid_{0}; }; } // namespace ffmpeg diff --git a/torchvision/csrc/cpu/video_reader/FfmpegAudioSampler.cpp b/torchvision/csrc/cpu/video_reader/FfmpegAudioSampler.cpp deleted file mode 100644 index 24aecacf946..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegAudioSampler.cpp +++ /dev/null @@ -1,118 +0,0 @@ -#include "FfmpegAudioSampler.h" -#include -#include "FfmpegUtil.h" - -using namespace std; - -FfmpegAudioSampler::FfmpegAudioSampler( - const AudioFormat& in, - const AudioFormat& out) - : inFormat_(in), outFormat_(out) {} - -FfmpegAudioSampler::~FfmpegAudioSampler() { - if (swrContext_) { - swr_free(&swrContext_); - } -} - -int FfmpegAudioSampler::init() { - swrContext_ = swr_alloc_set_opts( - nullptr, // we're allocating a new context - av_get_default_channel_layout(outFormat_.channels), // out_ch_layout - static_cast(outFormat_.format), // out_sample_fmt - outFormat_.samples, // out_sample_rate - av_get_default_channel_layout(inFormat_.channels), // in_ch_layout - static_cast(inFormat_.format), // in_sample_fmt - inFormat_.samples, // in_sample_rate - 0, // log_offset - nullptr); // log_ctx - if (swrContext_ == nullptr) { - LOG(ERROR) << "swr_alloc_set_opts fails"; - return -1; - } - int result = 0; - if ((result = swr_init(swrContext_)) < 0) { - LOG(ERROR) << "swr_init failed, err: " << ffmpeg_util::getErrorDesc(result) - << ", in -> format: " << inFormat_.format - << ", channels: " << inFormat_.channels - << ", samples: " << inFormat_.samples - << ", out -> format: " << outFormat_.format - << ", channels: " << outFormat_.channels - << ", samples: " << outFormat_.samples; - return -1; - } - return 0; -} - -int64_t FfmpegAudioSampler::getSampleBytes(const AVFrame* frame) const { - auto outSamples = getOutNumSamples(frame->nb_samples); - - return 
av_samples_get_buffer_size( - nullptr, - outFormat_.channels, - outSamples, - static_cast(outFormat_.format), - 1); -} - -// https://www.ffmpeg.org/doxygen/3.2/group__lswr.html -unique_ptr FfmpegAudioSampler::sample(const AVFrame* frame) { - if (!frame) { - return nullptr; // no flush for videos - } - - auto inNumSamples = frame->nb_samples; - auto outNumSamples = getOutNumSamples(frame->nb_samples); - - auto outSampleSize = getSampleBytes(frame); - AvDataPtr frameData(static_cast(av_malloc(outSampleSize))); - - uint8_t* outPlanes[AVRESAMPLE_MAX_CHANNELS]; - int result = 0; - if ((result = av_samples_fill_arrays( - outPlanes, - nullptr, // linesize is not needed - frameData.get(), - outFormat_.channels, - outNumSamples, - static_cast(outFormat_.format), - 1)) < 0) { - LOG(ERROR) << "av_samples_fill_arrays failed, err: " - << ffmpeg_util::getErrorDesc(result) - << ", outNumSamples: " << outNumSamples - << ", format: " << outFormat_.format; - return nullptr; - } - - if ((result = swr_convert( - swrContext_, - &outPlanes[0], - outNumSamples, - (const uint8_t**)&frame->data[0], - inNumSamples)) < 0) { - LOG(ERROR) << "swr_convert faield, err: " - << ffmpeg_util::getErrorDesc(result); - return nullptr; - } - // result returned by swr_convert is the No. of actual output samples. - // So update the buffer size using av_samples_get_buffer_size - result = av_samples_get_buffer_size( - nullptr, - outFormat_.channels, - result, - static_cast(outFormat_.format), - 1); - - return make_unique(std::move(frameData), result, 0); -} -/* -Because of decoding delay, the returned value is an upper bound of No. of -output samples -*/ -int64_t FfmpegAudioSampler::getOutNumSamples(int inNumSamples) const { - return av_rescale_rnd( - swr_get_delay(swrContext_, inFormat_.samples) + inNumSamples, - outFormat_.samples, - inFormat_.samples, - AV_ROUND_UP); -} diff --git a/torchvision/csrc/cpu/video_reader/FfmpegAudioSampler.h b/torchvision/csrc/cpu/video_reader/FfmpegAudioSampler.h deleted file mode 100644 index 767a5ca6e4f..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegAudioSampler.h +++ /dev/null @@ -1,32 +0,0 @@ -#pragma once - -#include "FfmpegSampler.h" - -#define AVRESAMPLE_MAX_CHANNELS 32 - -/** - * Class transcode audio frames from one format into another - */ -class FfmpegAudioSampler : public FfmpegSampler { - public: - explicit FfmpegAudioSampler(const AudioFormat& in, const AudioFormat& out); - ~FfmpegAudioSampler() override; - - int init() override; - - int64_t getSampleBytes(const AVFrame* frame) const; - // FfmpegSampler overrides - // returns number of bytes of the sampled data - std::unique_ptr sample(const AVFrame* frame) override; - - const AudioFormat& getInFormat() const { - return inFormat_; - } - - private: - int64_t getOutNumSamples(int inNumSamples) const; - - AudioFormat inFormat_; - AudioFormat outFormat_; - SwrContext* swrContext_{nullptr}; -}; diff --git a/torchvision/csrc/cpu/video_reader/FfmpegAudioStream.cpp b/torchvision/csrc/cpu/video_reader/FfmpegAudioStream.cpp deleted file mode 100644 index b5b1e2fbda5..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegAudioStream.cpp +++ /dev/null @@ -1,103 +0,0 @@ -#include "FfmpegAudioStream.h" -#include "FfmpegUtil.h" - -using namespace std; - -namespace { - -bool operator==(const AudioFormat& x, const AVCodecContext& y) { - return x.samples == y.sample_rate && x.channels == y.channels && - x.format == y.sample_fmt; -} - -AudioFormat& toAudioFormat( - AudioFormat& audioFormat, - const AVCodecContext& codecCtx) { - 
audioFormat.samples = codecCtx.sample_rate; - audioFormat.channels = codecCtx.channels; - audioFormat.format = codecCtx.sample_fmt; - - return audioFormat; -} - -} // namespace - -FfmpegAudioStream::FfmpegAudioStream( - AVFormatContext* inputCtx, - int index, - enum AVMediaType avMediaType, - MediaFormat mediaFormat, - double seekFrameMargin) - : FfmpegStream(inputCtx, index, avMediaType, seekFrameMargin), - mediaFormat_(mediaFormat) {} - -FfmpegAudioStream::~FfmpegAudioStream() {} - -void FfmpegAudioStream::checkStreamDecodeParams() { - auto timeBase = getTimeBase(); - if (timeBase.first > 0) { - CHECK_EQ(timeBase.first, inputCtx_->streams[index_]->time_base.num); - CHECK_EQ(timeBase.second, inputCtx_->streams[index_]->time_base.den); - } -} - -void FfmpegAudioStream::updateStreamDecodeParams() { - auto timeBase = getTimeBase(); - if (timeBase.first == 0) { - mediaFormat_.format.audio.timeBaseNum = - inputCtx_->streams[index_]->time_base.num; - mediaFormat_.format.audio.timeBaseDen = - inputCtx_->streams[index_]->time_base.den; - } - mediaFormat_.format.audio.duration = inputCtx_->streams[index_]->duration; -} - -int FfmpegAudioStream::initFormat() { - AudioFormat& format = mediaFormat_.format.audio; - - if (format.samples == 0) { - format.samples = codecCtx_->sample_rate; - } - if (format.channels == 0) { - format.channels = codecCtx_->channels; - } - if (format.format == AV_SAMPLE_FMT_NONE) { - format.format = codecCtx_->sample_fmt; - VLOG(2) << "set stream format sample_fmt: " << format.format; - } - - checkStreamDecodeParams(); - - updateStreamDecodeParams(); - - if (format.samples > 0 && format.channels > 0 && - format.format != AV_SAMPLE_FMT_NONE) { - return 0; - } else { - return -1; - } -} - -unique_ptr FfmpegAudioStream::sampleFrameData() { - AudioFormat& audioFormat = mediaFormat_.format.audio; - - if (!sampler_ || !(sampler_->getInFormat() == *codecCtx_)) { - AudioFormat newInFormat; - newInFormat = toAudioFormat(newInFormat, *codecCtx_); - sampler_ = make_unique(newInFormat, audioFormat); - VLOG(1) << "Set sampler input audio format" - << ", samples: " << newInFormat.samples - << ", channels: " << newInFormat.channels - << ", format: " << newInFormat.format - << " : output audio sampler format" - << ", samples: " << audioFormat.samples - << ", channels: " << audioFormat.channels - << ", format: " << audioFormat.format; - int ret = sampler_->init(); - if (ret < 0) { - VLOG(1) << "Fail to initialize audio sampler"; - return nullptr; - } - } - return sampler_->sample(frame_); -} diff --git a/torchvision/csrc/cpu/video_reader/FfmpegAudioStream.h b/torchvision/csrc/cpu/video_reader/FfmpegAudioStream.h deleted file mode 100644 index 1d4f7a2f2ee..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegAudioStream.h +++ /dev/null @@ -1,54 +0,0 @@ -#pragma once - -#include -#include "FfmpegAudioSampler.h" -#include "FfmpegStream.h" - -/** - * Class uses FFMPEG library to decode one video stream. 
- */ -class FfmpegAudioStream : public FfmpegStream { - public: - explicit FfmpegAudioStream( - AVFormatContext* inputCtx, - int index, - enum AVMediaType avMediaType, - MediaFormat mediaFormat, - double seekFrameMargin); - - ~FfmpegAudioStream() override; - - // FfmpegStream overrides - MediaType getMediaType() const override { - return MediaType::TYPE_AUDIO; - } - - FormatUnion getMediaFormat() const override { - return mediaFormat_.format; - } - - int64_t getStartPts() const override { - return mediaFormat_.format.audio.startPts; - } - int64_t getEndPts() const override { - return mediaFormat_.format.audio.endPts; - } - // return numerator and denominator of time base - std::pair getTimeBase() const { - return std::make_pair( - mediaFormat_.format.audio.timeBaseNum, - mediaFormat_.format.audio.timeBaseDen); - } - - void checkStreamDecodeParams(); - - void updateStreamDecodeParams(); - - protected: - int initFormat() override; - std::unique_ptr sampleFrameData() override; - - private: - MediaFormat mediaFormat_; - std::unique_ptr sampler_{nullptr}; -}; diff --git a/torchvision/csrc/cpu/video_reader/FfmpegDecoder.cpp b/torchvision/csrc/cpu/video_reader/FfmpegDecoder.cpp deleted file mode 100644 index fb4d302cc03..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegDecoder.cpp +++ /dev/null @@ -1,412 +0,0 @@ -#include "FfmpegDecoder.h" -#include "FfmpegAudioStream.h" -#include "FfmpegUtil.h" -#include "FfmpegVideoStream.h" - -using namespace std; - -static AVPacket avPkt; - -namespace { - -unique_ptr createFfmpegStream( - MediaType type, - AVFormatContext* ctx, - int idx, - MediaFormat& mediaFormat, - double seekFrameMargin) { - enum AVMediaType avType; - CHECK(ffmpeg_util::mapMediaType(type, &avType)); - switch (type) { - case MediaType::TYPE_VIDEO: - return make_unique( - ctx, idx, avType, mediaFormat, seekFrameMargin); - case MediaType::TYPE_AUDIO: - return make_unique( - ctx, idx, avType, mediaFormat, seekFrameMargin); - default: - return nullptr; - } -} - -} // namespace - -FfmpegAvioContext::FfmpegAvioContext() - : workBuffersize_(VIO_BUFFER_SZ), - workBuffer_((uint8_t*)av_malloc(workBuffersize_)), - inputFile_(nullptr), - inputBuffer_(nullptr), - inputBufferSize_(0) {} - -int FfmpegAvioContext::initAVIOContext(const uint8_t* buffer, int64_t size) { - inputBuffer_ = buffer; - inputBufferSize_ = size; - avioCtx_ = avio_alloc_context( - workBuffer_, - workBuffersize_, - 0, - reinterpret_cast(this), - &FfmpegAvioContext::readMemory, - nullptr, // no write function - &FfmpegAvioContext::seekMemory); - return 0; -} - -FfmpegAvioContext::~FfmpegAvioContext() { - /* note: the internal buffer could have changed, and be != workBuffer_ */ - if (avioCtx_) { - av_freep(&avioCtx_->buffer); - av_freep(&avioCtx_); - } else { - av_freep(&workBuffer_); - } - if (inputFile_) { - fclose(inputFile_); - } -} - -int FfmpegAvioContext::read(uint8_t* buf, int buf_size) { - if (inputBuffer_) { - return readMemory(this, buf, buf_size); - } else { - return -1; - } -} - -int FfmpegAvioContext::readMemory(void* opaque, uint8_t* buf, int buf_size) { - FfmpegAvioContext* h = static_cast(opaque); - if (buf_size < 0) { - return -1; - } - - int reminder = h->inputBufferSize_ - h->offset_; - int r = buf_size < reminder ? 
buf_size : reminder; - if (r < 0) { - return AVERROR_EOF; - } - - memcpy(buf, h->inputBuffer_ + h->offset_, r); - h->offset_ += r; - return r; -} - -int64_t FfmpegAvioContext::seek(int64_t offset, int whence) { - if (inputBuffer_) { - return seekMemory(this, offset, whence); - } else { - return -1; - } -} - -int64_t FfmpegAvioContext::seekMemory( - void* opaque, - int64_t offset, - int whence) { - FfmpegAvioContext* h = static_cast(opaque); - switch (whence) { - case SEEK_CUR: // from current position - h->offset_ += offset; - break; - case SEEK_END: // from eof - h->offset_ = h->inputBufferSize_ + offset; - break; - case SEEK_SET: // from beginning of file - h->offset_ = offset; - break; - case AVSEEK_SIZE: - return h->inputBufferSize_; - } - return h->offset_; -} - -int FfmpegDecoder::init( - const std::string& filename, - bool isDecodeFile, - FfmpegAvioContext& ioctx, - DecoderOutput& decoderOutput) { - cleanUp(); - - int ret = 0; - if (!isDecodeFile) { - formatCtx_ = avformat_alloc_context(); - if (!formatCtx_) { - LOG(ERROR) << "avformat_alloc_context failed"; - return -1; - } - formatCtx_->pb = ioctx.get_avio(); - formatCtx_->flags |= AVFMT_FLAG_CUSTOM_IO; - - // Determining the input format: - int probeSz = AVPROBE_SIZE + AVPROBE_PADDING_SIZE; - uint8_t* probe((uint8_t*)av_malloc(probeSz)); - memset(probe, 0, probeSz); - int len = ioctx.read(probe, probeSz - AVPROBE_PADDING_SIZE); - if (len < probeSz - AVPROBE_PADDING_SIZE) { - LOG(ERROR) << "Insufficient data to determine video format"; - av_freep(&probe); - return -1; - } - // seek back to start of stream - ioctx.seek(0, SEEK_SET); - - unique_ptr probeData(new AVProbeData()); - probeData->buf = probe; - probeData->buf_size = len; - probeData->filename = ""; - // Determine the input-format: - formatCtx_->iformat = av_probe_input_format(probeData.get(), 1); - // this is to avoid the double-free error - if (formatCtx_->iformat == nullptr) { - LOG(ERROR) << "av_probe_input_format fails"; - return -1; - } - VLOG(1) << "av_probe_input_format succeeds"; - av_freep(&probe); - - ret = avformat_open_input(&formatCtx_, "", nullptr, nullptr); - } else { - ret = avformat_open_input(&formatCtx_, filename.c_str(), nullptr, nullptr); - } - - if (ret < 0) { - LOG(ERROR) << "avformat_open_input failed, error: " - << ffmpeg_util::getErrorDesc(ret); - cleanUp(); - return ret; - } - ret = avformat_find_stream_info(formatCtx_, nullptr); - if (ret < 0) { - LOG(ERROR) << "avformat_find_stream_info failed, error: " - << ffmpeg_util::getErrorDesc(ret); - cleanUp(); - return ret; - } - if (!initStreams()) { - LOG(ERROR) << "Cannot activate streams"; - cleanUp(); - return -1; - } - - for (auto& stream : streams_) { - MediaType mediaType = stream.second->getMediaType(); - decoderOutput.initMediaType(mediaType, stream.second->getMediaFormat()); - } - VLOG(1) << "FfmpegDecoder initialized"; - return 0; -} - -int FfmpegDecoder::decodeFile( - unique_ptr params, - const string& fileName, - DecoderOutput& decoderOutput) { - VLOG(1) << "decode file: " << fileName; - FfmpegAvioContext ioctx; - int ret = decodeLoop(std::move(params), fileName, true, ioctx, decoderOutput); - return ret; -} - -int FfmpegDecoder::decodeMemory( - unique_ptr params, - const uint8_t* buffer, - int64_t size, - DecoderOutput& decoderOutput) { - VLOG(1) << "decode video data in memory"; - FfmpegAvioContext ioctx; - int ret = ioctx.initAVIOContext(buffer, size); - if (ret == 0) { - ret = - decodeLoop(std::move(params), string(""), false, ioctx, decoderOutput); - } - return ret; -} - -int 
FfmpegDecoder::probeFile( - unique_ptr params, - const string& fileName, - DecoderOutput& decoderOutput) { - VLOG(1) << "probe file: " << fileName; - FfmpegAvioContext ioctx; - return probeVideo(std::move(params), fileName, true, ioctx, decoderOutput); -} - -int FfmpegDecoder::probeMemory( - unique_ptr params, - const uint8_t* buffer, - int64_t size, - DecoderOutput& decoderOutput) { - VLOG(1) << "probe video data in memory"; - FfmpegAvioContext ioctx; - int ret = ioctx.initAVIOContext(buffer, size); - if (ret == 0) { - ret = - probeVideo(std::move(params), string(""), false, ioctx, decoderOutput); - } - return ret; -} - -void FfmpegDecoder::cleanUp() { - if (formatCtx_) { - for (auto& stream : streams_) { - // Drain stream buffers. - DecoderOutput decoderOutput; - stream.second->flush(1, decoderOutput); - stream.second.reset(); - } - streams_.clear(); - avformat_close_input(&formatCtx_); - } -} - -FfmpegStream* FfmpegDecoder::findStreamByIndex(int streamIndex) const { - auto it = streams_.find(streamIndex); - return it != streams_.end() ? it->second.get() : nullptr; -} - -/* -Reference implementation: -https://ffmpeg.org/doxygen/3.4/demuxing_decoding_8c-example.html -*/ -int FfmpegDecoder::decodeLoop( - unique_ptr params, - const std::string& filename, - bool isDecodeFile, - FfmpegAvioContext& ioctx, - DecoderOutput& decoderOutput) { - params_ = std::move(params); - - int ret = init(filename, isDecodeFile, ioctx, decoderOutput); - if (ret < 0) { - return ret; - } - // init package - av_init_packet(&avPkt); - avPkt.data = nullptr; - avPkt.size = 0; - - int result = 0; - bool ptsInRange = true; - while (ptsInRange) { - result = av_read_frame(formatCtx_, &avPkt); - if (result == AVERROR(EAGAIN)) { - VLOG(1) << "Decoder is busy"; - ret = 0; - break; - } else if (result == AVERROR_EOF) { - VLOG(1) << "Stream decoding is completed"; - ret = 0; - break; - } else if (result < 0) { - VLOG(1) << "av_read_frame fails. Break decoder loop. Error: " - << ffmpeg_util::getErrorDesc(result); - ret = result; - break; - } - - ret = 0; - auto stream = findStreamByIndex(avPkt.stream_index); - if (stream == nullptr) { - // the packet is from a stream the caller is not interested. Ignore it - VLOG(2) << "avPkt ignored. stream index: " << avPkt.stream_index; - // Need to free the memory of AVPacket. Otherwise, memory leak happens - av_packet_unref(&avPkt); - continue; - } - - do { - result = stream->sendPacket(&avPkt); - if (result == AVERROR(EAGAIN)) { - VLOG(2) << "avcodec_send_packet returns AVERROR(EAGAIN)"; - // start to recevie available frames from internal buffer - stream->receiveAvailFrames(params_->getPtsOnly, decoderOutput); - if (isPtsExceedRange()) { - // exit the most-outer while loop - VLOG(1) << "In all streams, exceed the end pts. Exit decoding loop"; - ret = 0; - ptsInRange = false; - break; - } - } else if (result < 0) { - LOG(WARNING) << "avcodec_send_packet failed. Error: " - << ffmpeg_util::getErrorDesc(result); - ret = result; - break; - } else { - VLOG(2) << "avcodec_send_packet succeeds"; - // succeed. Read the next AVPacket and send out it - break; - } - } while (ptsInRange); - // Need to free the memory of AVPacket. 
Otherwise, memory leak happens - av_packet_unref(&avPkt); - } - /* flush cached frames */ - flushStreams(decoderOutput); - return ret; -} - -int FfmpegDecoder::probeVideo( - unique_ptr params, - const std::string& filename, - bool isDecodeFile, - FfmpegAvioContext& ioctx, - DecoderOutput& decoderOutput) { - params_ = std::move(params); - return init(filename, isDecodeFile, ioctx, decoderOutput); -} - -bool FfmpegDecoder::initStreams() { - for (auto it = params_->formats.begin(); it != params_->formats.end(); ++it) { - AVMediaType mediaType; - if (!ffmpeg_util::mapMediaType(it->first, &mediaType)) { - LOG(ERROR) << "Unknown media type: " << it->first; - return false; - } - int streamIdx = - av_find_best_stream(formatCtx_, mediaType, -1, -1, nullptr, 0); - - if (streamIdx >= 0) { - VLOG(2) << "find stream index: " << streamIdx; - auto stream = createFfmpegStream( - it->first, - formatCtx_, - streamIdx, - it->second, - params_->seekFrameMargin); - - CHECK(stream); - if (stream->openCodecContext() < 0) { - LOG(ERROR) << "Cannot open codec. Stream index: " << streamIdx; - return false; - } - streams_.emplace(streamIdx, move(stream)); - } else { - VLOG(1) << "Cannot open find stream of type " << it->first; - } - } - // Seek frames in each stream - int ret = 0; - for (auto& stream : streams_) { - auto startPts = stream.second->getStartPts(); - VLOG(1) << "stream: " << stream.first << " startPts: " << startPts; - if (startPts > 0 && (ret = stream.second->seekFrame(startPts)) < 0) { - LOG(WARNING) << "seekFrame in stream fails"; - return false; - } - } - VLOG(1) << "initStreams succeeds"; - return true; -} - -bool FfmpegDecoder::isPtsExceedRange() { - bool exceed = true; - for (auto& stream : streams_) { - exceed = exceed && stream.second->isFramePtsExceedRange(); - } - return exceed; -} - -void FfmpegDecoder::flushStreams(DecoderOutput& decoderOutput) { - for (auto& stream : streams_) { - stream.second->flush(params_->getPtsOnly, decoderOutput); - } -} diff --git a/torchvision/csrc/cpu/video_reader/FfmpegDecoder.h b/torchvision/csrc/cpu/video_reader/FfmpegDecoder.h deleted file mode 100644 index a0a564a4214..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegDecoder.h +++ /dev/null @@ -1,127 +0,0 @@ -#pragma once - -#include -#include - -#include "FfmpegHeaders.h" -#include "FfmpegStream.h" -#include "Interface.h" - -#define VIO_BUFFER_SZ 81920 -#define AVPROBE_SIZE 8192 - -class DecoderParameters { - public: - std::unordered_map formats; - // av_seek_frame is imprecise so seek to a timestamp earlier by a margin - // The unit of margin is second - double seekFrameMargin{1.0}; - // When getPtsOnly is set to 1, we only get pts of each frame and don not - // output frame data. 
It will be much faster - int64_t getPtsOnly{0}; -}; - -class FfmpegAvioContext { - public: - FfmpegAvioContext(); - - int initAVIOContext(const uint8_t* buffer, int64_t size); - - ~FfmpegAvioContext(); - - int read(uint8_t* buf, int buf_size); - - static int readMemory(void* opaque, uint8_t* buf, int buf_size); - - int64_t seek(int64_t offset, int whence); - - static int64_t seekMemory(void* opaque, int64_t offset, int whence); - - AVIOContext* get_avio() { - return avioCtx_; - } - - private: - int workBuffersize_; - uint8_t* workBuffer_; - // for file mode - FILE* inputFile_; - // for memory mode - const uint8_t* inputBuffer_; - int inputBufferSize_; - int offset_ = 0; - - AVIOContext* avioCtx_{nullptr}; -}; - -class FfmpegDecoder { - public: - FfmpegDecoder() { - av_register_all(); - } - ~FfmpegDecoder() { - cleanUp(); - } - // return 0 on success - // return negative number on failure - int decodeFile( - std::unique_ptr params, - const std::string& filename, - DecoderOutput& decoderOutput); - // return 0 on success - // return negative number on failure - int decodeMemory( - std::unique_ptr params, - const uint8_t* buffer, - int64_t size, - DecoderOutput& decoderOutput); - // return 0 on success - // return negative number on failure - int probeFile( - std::unique_ptr params, - const std::string& filename, - DecoderOutput& decoderOutput); - // return 0 on success - // return negative number on failure - int probeMemory( - std::unique_ptr params, - const uint8_t* buffer, - int64_t size, - DecoderOutput& decoderOutput); - - void cleanUp(); - - private: - FfmpegStream* findStreamByIndex(int streamIndex) const; - - int init( - const std::string& filename, - bool isDecodeFile, - FfmpegAvioContext& ioctx, - DecoderOutput& decoderOutput); - // return 0 on success - // return negative number on failure - int decodeLoop( - std::unique_ptr params, - const std::string& filename, - bool isDecodeFile, - FfmpegAvioContext& ioctx, - DecoderOutput& decoderOutput); - - int probeVideo( - std::unique_ptr params, - const std::string& filename, - bool isDecodeFile, - FfmpegAvioContext& ioctx, - DecoderOutput& decoderOutput); - - bool initStreams(); - - void flushStreams(DecoderOutput& decoderOutput); - // whether in all streams, the pts of most recent frame exceeds range - bool isPtsExceedRange(); - - std::unordered_map> streams_; - AVFormatContext* formatCtx_{nullptr}; - std::unique_ptr params_{nullptr}; -}; diff --git a/torchvision/csrc/cpu/video_reader/FfmpegHeaders.h b/torchvision/csrc/cpu/video_reader/FfmpegHeaders.h deleted file mode 100644 index ff26aa30a8d..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegHeaders.h +++ /dev/null @@ -1,13 +0,0 @@ -#pragma once - -extern "C" { -#include -#include -#include -#include -#include -#include -#include -#include -#include -} diff --git a/torchvision/csrc/cpu/video_reader/FfmpegSampler.h b/torchvision/csrc/cpu/video_reader/FfmpegSampler.h deleted file mode 100644 index 3d00be3486f..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegSampler.h +++ /dev/null @@ -1,16 +0,0 @@ -#pragma once - -#include "FfmpegHeaders.h" -#include "Interface.h" - -/** - * Class sample data from AVFrame - */ -class FfmpegSampler { - public: - virtual ~FfmpegSampler() = default; - // return 0 on success and negative number on failure - virtual int init() = 0; - // sample from the given frame - virtual std::unique_ptr sample(const AVFrame* frame) = 0; -}; diff --git a/torchvision/csrc/cpu/video_reader/FfmpegStream.cpp 
b/torchvision/csrc/cpu/video_reader/FfmpegStream.cpp deleted file mode 100644 index b745170baf4..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegStream.cpp +++ /dev/null @@ -1,188 +0,0 @@ -#include "FfmpegStream.h" -#include "FfmpegUtil.h" - -using namespace std; - -// (TODO) Currently, disable the use of refCount -static int refCount = 0; - -FfmpegStream::FfmpegStream( - AVFormatContext* inputCtx, - int index, - enum AVMediaType avMediaType, - double seekFrameMargin) - : inputCtx_(inputCtx), - index_(index), - avMediaType_(avMediaType), - seekFrameMargin_(seekFrameMargin) {} - -FfmpegStream::~FfmpegStream() { - if (frame_) { - av_frame_free(&frame_); - } - avcodec_free_context(&codecCtx_); -} - -int FfmpegStream::openCodecContext() { - VLOG(2) << "stream start_time: " << inputCtx_->streams[index_]->start_time; - - auto typeString = av_get_media_type_string(avMediaType_); - AVStream* st = inputCtx_->streams[index_]; - auto codec_id = st->codecpar->codec_id; - VLOG(1) << "codec_id: " << codec_id; - AVCodec* codec = avcodec_find_decoder(codec_id); - if (!codec) { - LOG(ERROR) << "avcodec_find_decoder failed for codec_id: " << int(codec_id); - return AVERROR(EINVAL); - } - VLOG(1) << "Succeed to find decoder"; - - codecCtx_ = avcodec_alloc_context3(codec); - if (!codecCtx_) { - LOG(ERROR) << "avcodec_alloc_context3 fails"; - return AVERROR(ENOMEM); - } - - int ret; - /* Copy codec parameters from input stream to output codec context */ - if ((ret = avcodec_parameters_to_context(codecCtx_, st->codecpar)) < 0) { - LOG(ERROR) << "Failed to copy " << typeString - << " codec parameters to decoder context"; - return ret; - } - - AVDictionary* opts = nullptr; - av_dict_set(&opts, "refcounted_frames", refCount ? "1" : "0", 0); - - // after avcodec_open2, value of codecCtx_->time_base is NOT meaningful - // But inputCtx_->streams[index_]->time_base has meaningful values - if ((ret = avcodec_open2(codecCtx_, codec, &opts)) < 0) { - LOG(ERROR) << "avcodec_open2 failed. " << ffmpeg_util::getErrorDesc(ret); - return ret; - } - VLOG(1) << "Succeed to open codec"; - - frame_ = av_frame_alloc(); - return initFormat(); -} - -unique_ptr FfmpegStream::getFrameData(int getPtsOnly) { - if (!codecCtx_) { - LOG(ERROR) << "Codec is not initialized"; - return nullptr; - } - if (getPtsOnly) { - unique_ptr decodedFrame = make_unique(); - decodedFrame->pts_ = frame_->pts; - return decodedFrame; - } else { - unique_ptr decodedFrame = sampleFrameData(); - if (decodedFrame) { - decodedFrame->pts_ = frame_->pts; - } - return decodedFrame; - } -} - -void FfmpegStream::flush(int getPtsOnly, DecoderOutput& decoderOutput) { - VLOG(1) << "Media Type: " << getMediaType() << ", flush stream."; - // need to receive frames before entering draining mode - receiveAvailFrames(getPtsOnly, decoderOutput); - - VLOG(2) << "send nullptr packet"; - sendPacket(nullptr); - // receive remaining frames after entering draining mode - receiveAvailFrames(getPtsOnly, decoderOutput); - - avcodec_flush_buffers(codecCtx_); -} - -bool FfmpegStream::isFramePtsInRange() { - CHECK(frame_); - auto pts = frame_->pts; - auto startPts = this->getStartPts(); - auto endPts = this->getEndPts(); - VLOG(2) << "isPtsInRange. pts: " << pts << ", startPts: " << startPts - << ", endPts: " << endPts; - return (pts == AV_NOPTS_VALUE) || - (pts >= startPts && (endPts >= 0 ? pts <= endPts : true)); -} - -bool FfmpegStream::isFramePtsExceedRange() { - if (frame_) { - auto endPts = this->getEndPts(); - VLOG(2) << "isFramePtsExceedRange. 
last_pts_: " << last_pts_ - << ", endPts: " << endPts; - return endPts >= 0 ? last_pts_ >= endPts : false; - } else { - return true; - } -} - -// seek a frame -int FfmpegStream::seekFrame(int64_t seekPts) { - // translate margin from second to pts - int64_t margin = (int64_t)( - seekFrameMargin_ * (double)inputCtx_->streams[index_]->time_base.den / - (double)inputCtx_->streams[index_]->time_base.num); - int64_t real_seekPts = (seekPts - margin) > 0 ? (seekPts - margin) : 0; - VLOG(2) << "seek margin: " << margin; - VLOG(2) << "real seekPts: " << real_seekPts; - int ret = av_seek_frame( - inputCtx_, - index_, - (seekPts - margin) > 0 ? (seekPts - margin) : 0, - AVSEEK_FLAG_BACKWARD); - if (ret < 0) { - LOG(WARNING) << "av_seek_frame fails. Stream index: " << index_; - return ret; - } - return 0; -} - -// send/receive encoding and decoding API overview -// https://ffmpeg.org/doxygen/3.4/group__lavc__encdec.html -int FfmpegStream::sendPacket(const AVPacket* packet) { - return avcodec_send_packet(codecCtx_, packet); -} - -int FfmpegStream::receiveFrame() { - int ret = avcodec_receive_frame(codecCtx_, frame_); - if (ret >= 0) { - // succeed - frame_->pts = av_frame_get_best_effort_timestamp(frame_); - if (frame_->pts == AV_NOPTS_VALUE) { - // Trick: if we can not figure out pts, we just set it to be (last_pts + - // 1) - frame_->pts = last_pts_ + 1; - } - last_pts_ = frame_->pts; - - VLOG(2) << "avcodec_receive_frame succeed"; - } else if (ret == AVERROR(EAGAIN)) { - VLOG(2) << "avcodec_receive_frame fails and returns AVERROR(EAGAIN). "; - } else if (ret == AVERROR_EOF) { - // no more frame to read - VLOG(2) << "avcodec_receive_frame returns AVERROR_EOF"; - } else { - LOG(WARNING) << "avcodec_receive_frame failed. Error: " - << ffmpeg_util::getErrorDesc(ret); - } - return ret; -} - -void FfmpegStream::receiveAvailFrames( - int getPtsOnly, - DecoderOutput& decoderOutput) { - int result = 0; - while ((result = receiveFrame()) >= 0) { - unique_ptr decodedFrame = getFrameData(getPtsOnly); - - if (decodedFrame && - ((!getPtsOnly && decodedFrame->frameSize_ > 0) || getPtsOnly)) { - if (isFramePtsInRange()) { - decoderOutput.addMediaFrame(getMediaType(), std::move(decodedFrame)); - } - } // end-if - } // end-while -} diff --git a/torchvision/csrc/cpu/video_reader/FfmpegStream.h b/torchvision/csrc/cpu/video_reader/FfmpegStream.h deleted file mode 100644 index b66a36977ec..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegStream.h +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. -#pragma once - -#include -#include -#include -#include "FfmpegHeaders.h" -#include "Interface.h" - -/* -Class uses FFMPEG library to decode one media stream (audio or video). -*/ -class FfmpegStream { - public: - FfmpegStream( - AVFormatContext* inputCtx, - int index, - enum AVMediaType avMediaType, - double seekFrameMargin); - virtual ~FfmpegStream(); - - // returns 0 - on success or negative error - int openCodecContext(); - // returns stream index - int getIndex() const { - return index_; - } - // returns number decoded/sampled bytes - std::unique_ptr getFrameData(int getPtsOnly); - // flush the stream at the end of decoding. 
- // Return 0 on success and -1 when cache is drained - void flush(int getPtsOnly, DecoderOutput& decoderOutput); - // seek a frame - int seekFrame(int64_t ts); - // send an AVPacket - int sendPacket(const AVPacket* packet); - // receive AVFrame - int receiveFrame(); - // receive all available frames from the internal buffer - void receiveAvailFrames(int getPtsOnly, DecoderOutput& decoderOutput); - // return media type - virtual MediaType getMediaType() const = 0; - // return media format - virtual FormatUnion getMediaFormat() const = 0; - // return start presentation timestamp - virtual int64_t getStartPts() const = 0; - // return end presentation timestamp - virtual int64_t getEndPts() const = 0; - // is the pts of most recent frame within range? - bool isFramePtsInRange(); - // does the pts of most recent frame exceed range? - bool isFramePtsExceedRange(); - - protected: - virtual int initFormat() = 0; - // returns a decoded frame - virtual std::unique_ptr sampleFrameData() = 0; - - protected: - AVFormatContext* const inputCtx_; - const int index_; - enum AVMediaType avMediaType_; - - AVCodecContext* codecCtx_{nullptr}; - AVFrame* frame_{nullptr}; - // pts of last decoded frame - int64_t last_pts_{0}; - double seekFrameMargin_{1.0}; -}; diff --git a/torchvision/csrc/cpu/video_reader/FfmpegUtil.cpp b/torchvision/csrc/cpu/video_reader/FfmpegUtil.cpp deleted file mode 100644 index 9e804ee67c0..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegUtil.cpp +++ /dev/null @@ -1,111 +0,0 @@ -#include "FfmpegUtil.h" - -using namespace std; - -namespace ffmpeg_util { - -bool mapFfmpegType(AVMediaType media, MediaType* type) { - switch (media) { - case AVMEDIA_TYPE_VIDEO: - *type = MediaType::TYPE_VIDEO; - return true; - case AVMEDIA_TYPE_AUDIO: - *type = MediaType::TYPE_AUDIO; - return true; - default: - return false; - } -} - -bool mapMediaType(MediaType type, AVMediaType* media) { - switch (type) { - case MediaType::TYPE_VIDEO: - *media = AVMEDIA_TYPE_VIDEO; - return true; - case MediaType::TYPE_AUDIO: - *media = AVMEDIA_TYPE_AUDIO; - return true; - default: - return false; - } -} - -void setFormatDimensions( - int& destW, - int& destH, - int userW, - int userH, - int srcW, - int srcH, - int minDimension) { - // rounding rules - // int -> double -> round - // round up if fraction is >= 0.5 or round down if fraction is < 0.5 - // int result = double(value) + 0.5 - // here we rounding double to int according to the above rule - if (userW == 0 && userH == 0) { - if (minDimension > 0) { // #2 - if (srcW > srcH) { - // landscape - destH = minDimension; - destW = round(double(srcW * minDimension) / srcH); - } else { - // portrait - destW = minDimension; - destH = round(double(srcH * minDimension) / srcW); - } - } else { // #1 - destW = srcW; - destH = srcH; - } - } else if (userW != 0 && userH == 0) { // #3 - destW = userW; - destH = round(double(srcH * userW) / srcW); - } else if (userW == 0 && userH != 0) { // #4 - destW = round(double(srcW * userH) / srcH); - destH = userH; - } else { - // userW != 0 && userH != 0. 
#5 - destW = userW; - destH = userH; - } - // prevent zeros - destW = std::max(destW, 1); - destH = std::max(destH, 1); -} - -bool validateVideoFormat(const VideoFormat& f) { - /* - Valid parameters values for decoder - ___________________________________________________ - | W | H | minDimension | algorithm | - |_________________________________________________| - | 0 | 0 | 0 | original | - |_________________________________________________| - | 0 | 0 | >0 |scale to min dimension| - |_____|_____|____________________________________ | - | >0 | 0 | 0 | scale keeping W | - |_________________________________________________| - | 0 | >0 | 0 | scale keeping H | - |_________________________________________________| - | >0 | >0 | 0 | stretch/scale | - |_________________________________________________| - - */ - return (f.width == 0 && f.height == 0) || // #1 and #2 - (f.width != 0 && f.height != 0 && f.minDimension == 0) || // # 5 - (((f.width != 0 && f.height == 0) || // #3 and #4 - (f.width == 0 && f.height != 0)) && - f.minDimension == 0); -} - -string getErrorDesc(int errnum) { - array buffer; - if (av_strerror(errnum, buffer.data(), buffer.size()) < 0) { - return string("Unknown error code"); - } - buffer.back() = 0; - return string(buffer.data()); -} - -} // namespace ffmpeg_util diff --git a/torchvision/csrc/cpu/video_reader/FfmpegUtil.h b/torchvision/csrc/cpu/video_reader/FfmpegUtil.h deleted file mode 100644 index 9f42eb53c97..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegUtil.h +++ /dev/null @@ -1,27 +0,0 @@ -#pragma once - -#include -#include -#include "FfmpegHeaders.h" -#include "Interface.h" - -namespace ffmpeg_util { - -bool mapFfmpegType(AVMediaType media, enum MediaType* type); - -bool mapMediaType(MediaType type, enum AVMediaType* media); - -void setFormatDimensions( - int& destW, - int& destH, - int userW, - int userH, - int srcW, - int srcH, - int minDimension); - -bool validateVideoFormat(const VideoFormat& f); - -std::string getErrorDesc(int errnum); - -} // namespace ffmpeg_util diff --git a/torchvision/csrc/cpu/video_reader/FfmpegVideoSampler.cpp b/torchvision/csrc/cpu/video_reader/FfmpegVideoSampler.cpp deleted file mode 100644 index d87b3104dd5..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegVideoSampler.cpp +++ /dev/null @@ -1,90 +0,0 @@ -#include "FfmpegVideoSampler.h" -#include "FfmpegUtil.h" - -using namespace std; - -FfmpegVideoSampler::FfmpegVideoSampler( - const VideoFormat& in, - const VideoFormat& out, - int swsFlags) - : inFormat_(in), outFormat_(out), swsFlags_(swsFlags) {} - -FfmpegVideoSampler::~FfmpegVideoSampler() { - if (scaleContext_) { - sws_freeContext(scaleContext_); - scaleContext_ = nullptr; - } -} - -int FfmpegVideoSampler::init() { - VLOG(1) << "Input format: width " << inFormat_.width << ", height " - << inFormat_.height << ", format " << inFormat_.format - << ", minDimension " << inFormat_.minDimension; - VLOG(1) << "Scale format: width " << outFormat_.width << ", height " - << outFormat_.height << ", format " << outFormat_.format - << ", minDimension " << outFormat_.minDimension; - - scaleContext_ = sws_getContext( - inFormat_.width, - inFormat_.height, - (AVPixelFormat)inFormat_.format, - outFormat_.width, - outFormat_.height, - static_cast(outFormat_.format), - swsFlags_, - nullptr, - nullptr, - nullptr); - if (scaleContext_) { - return 0; - } else { - return -1; - } -} - -int32_t FfmpegVideoSampler::getImageBytes() const { - return av_image_get_buffer_size( - (AVPixelFormat)outFormat_.format, outFormat_.width, 
outFormat_.height, 1); -} - -// https://ffmpeg.org/doxygen/3.4/scaling_video_8c-example.html#a10 -unique_ptr FfmpegVideoSampler::sample(const AVFrame* frame) { - if (!frame) { - return nullptr; // no flush for videos - } - // scaled and cropped image - auto outImageSize = getImageBytes(); - AvDataPtr frameData(static_cast(av_malloc(outImageSize))); - - uint8_t* scalePlanes[4] = {nullptr}; - int scaleLines[4] = {0}; - - int result; - if ((result = av_image_fill_arrays( - scalePlanes, - scaleLines, - frameData.get(), - static_cast(outFormat_.format), - outFormat_.width, - outFormat_.height, - 1)) < 0) { - LOG(ERROR) << "av_image_fill_arrays failed, err: " - << ffmpeg_util::getErrorDesc(result); - return nullptr; - } - - if ((result = sws_scale( - scaleContext_, - frame->data, - frame->linesize, - 0, - inFormat_.height, - scalePlanes, - scaleLines)) < 0) { - LOG(ERROR) << "sws_scale failed, err: " - << ffmpeg_util::getErrorDesc(result); - return nullptr; - } - - return make_unique(std::move(frameData), outImageSize, 0); -} diff --git a/torchvision/csrc/cpu/video_reader/FfmpegVideoSampler.h b/torchvision/csrc/cpu/video_reader/FfmpegVideoSampler.h deleted file mode 100644 index 1fd6862f537..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegVideoSampler.h +++ /dev/null @@ -1,32 +0,0 @@ -#pragma once - -#include "FfmpegSampler.h" - -/** - * Class transcode video frames from one format into another - */ - -class FfmpegVideoSampler : public FfmpegSampler { - public: - explicit FfmpegVideoSampler( - const VideoFormat& in, - const VideoFormat& out, - int swsFlags = SWS_AREA); - ~FfmpegVideoSampler() override; - - int init() override; - - int32_t getImageBytes() const; - // returns number of bytes of the sampled data - std::unique_ptr sample(const AVFrame* frame) override; - - const VideoFormat& getInFormat() const { - return inFormat_; - } - - private: - VideoFormat inFormat_; - VideoFormat outFormat_; - int swsFlags_; - SwsContext* scaleContext_{nullptr}; -}; diff --git a/torchvision/csrc/cpu/video_reader/FfmpegVideoStream.cpp b/torchvision/csrc/cpu/video_reader/FfmpegVideoStream.cpp deleted file mode 100644 index 7a429249a71..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegVideoStream.cpp +++ /dev/null @@ -1,115 +0,0 @@ -#include "FfmpegVideoStream.h" -#include "FfmpegUtil.h" - -using namespace std; - -namespace { - -bool operator==(const VideoFormat& x, const AVFrame& y) { - return x.width == y.width && x.height == y.height && - x.format == static_cast(y.format); -} - -VideoFormat toVideoFormat(const AVFrame& frame) { - VideoFormat videoFormat; - videoFormat.width = frame.width; - videoFormat.height = frame.height; - videoFormat.format = static_cast(frame.format); - - return videoFormat; -} - -} // namespace - -FfmpegVideoStream::FfmpegVideoStream( - AVFormatContext* inputCtx, - int index, - enum AVMediaType avMediaType, - MediaFormat mediaFormat, - double seekFrameMargin) - : FfmpegStream(inputCtx, index, avMediaType, seekFrameMargin), - mediaFormat_(mediaFormat) {} - -FfmpegVideoStream::~FfmpegVideoStream() {} - -void FfmpegVideoStream::checkStreamDecodeParams() { - auto timeBase = getTimeBase(); - if (timeBase.first > 0) { - CHECK_EQ(timeBase.first, inputCtx_->streams[index_]->time_base.num); - CHECK_EQ(timeBase.second, inputCtx_->streams[index_]->time_base.den); - } -} - -void FfmpegVideoStream::updateStreamDecodeParams() { - auto timeBase = getTimeBase(); - if (timeBase.first == 0) { - mediaFormat_.format.video.timeBaseNum = - inputCtx_->streams[index_]->time_base.num; - 
mediaFormat_.format.video.timeBaseDen = - inputCtx_->streams[index_]->time_base.den; - } - mediaFormat_.format.video.duration = inputCtx_->streams[index_]->duration; -} - -int FfmpegVideoStream::initFormat() { - // set output format - VideoFormat& format = mediaFormat_.format.video; - if (!ffmpeg_util::validateVideoFormat(format)) { - LOG(ERROR) << "Invalid video format"; - return -1; - } - - format.fps = av_q2d( - av_guess_frame_rate(inputCtx_, inputCtx_->streams[index_], nullptr)); - - // keep aspect ratio - ffmpeg_util::setFormatDimensions( - format.width, - format.height, - format.width, - format.height, - codecCtx_->width, - codecCtx_->height, - format.minDimension); - - VLOG(1) << "After adjusting, video format" - << ", width: " << format.width << ", height: " << format.height - << ", format: " << format.format - << ", minDimension: " << format.minDimension; - - if (format.format == AV_PIX_FMT_NONE) { - format.format = codecCtx_->pix_fmt; - VLOG(1) << "Set pixel format: " << format.format; - } - - checkStreamDecodeParams(); - - updateStreamDecodeParams(); - - return format.width != 0 && format.height != 0 && - format.format != AV_PIX_FMT_NONE - ? 0 - : -1; -} - -unique_ptr FfmpegVideoStream::sampleFrameData() { - VideoFormat& format = mediaFormat_.format.video; - if (!sampler_ || !(sampler_->getInFormat() == *frame_)) { - VideoFormat newInFormat = toVideoFormat(*frame_); - sampler_ = make_unique(newInFormat, format, SWS_AREA); - VLOG(1) << "Set input video sampler format" - << ", width: " << newInFormat.width - << ", height: " << newInFormat.height - << ", format: " << newInFormat.format - << " : output video sampler format" - << ", width: " << format.width << ", height: " << format.height - << ", format: " << format.format - << ", minDimension: " << format.minDimension; - int ret = sampler_->init(); - if (ret < 0) { - VLOG(1) << "Fail to initialize video sampler"; - return nullptr; - } - } - return sampler_->sample(frame_); -} diff --git a/torchvision/csrc/cpu/video_reader/FfmpegVideoStream.h b/torchvision/csrc/cpu/video_reader/FfmpegVideoStream.h deleted file mode 100644 index 9bfbc9f665b..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegVideoStream.h +++ /dev/null @@ -1,54 +0,0 @@ -#pragma once - -#include -#include "FfmpegStream.h" -#include "FfmpegVideoSampler.h" - -/** - * Class uses FFMPEG library to decode one video stream. 
- */ -class FfmpegVideoStream : public FfmpegStream { - public: - explicit FfmpegVideoStream( - AVFormatContext* inputCtx, - int index, - enum AVMediaType avMediaType, - MediaFormat mediaFormat, - double seekFrameMargin); - - ~FfmpegVideoStream() override; - - // FfmpegStream overrides - MediaType getMediaType() const override { - return MediaType::TYPE_VIDEO; - } - - FormatUnion getMediaFormat() const override { - return mediaFormat_.format; - } - - int64_t getStartPts() const override { - return mediaFormat_.format.video.startPts; - } - int64_t getEndPts() const override { - return mediaFormat_.format.video.endPts; - } - // return numerator and denominator of time base - std::pair getTimeBase() const { - return std::make_pair( - mediaFormat_.format.video.timeBaseNum, - mediaFormat_.format.video.timeBaseDen); - } - - void checkStreamDecodeParams(); - - void updateStreamDecodeParams(); - - protected: - int initFormat() override; - std::unique_ptr sampleFrameData() override; - - private: - MediaFormat mediaFormat_; - std::unique_ptr sampler_{nullptr}; -}; diff --git a/torchvision/csrc/cpu/video_reader/Interface.cpp b/torchvision/csrc/cpu/video_reader/Interface.cpp deleted file mode 100644 index 0ec9f155821..00000000000 --- a/torchvision/csrc/cpu/video_reader/Interface.cpp +++ /dev/null @@ -1,22 +0,0 @@ -#include "Interface.h" - -void DecoderOutput::initMediaType(MediaType mediaType, FormatUnion format) { - MediaData mediaData(format); - media_data_.emplace(mediaType, std::move(mediaData)); -} - -void DecoderOutput::addMediaFrame( - MediaType mediaType, - std::unique_ptr frame) { - if (media_data_.find(mediaType) != media_data_.end()) { - VLOG(1) << "media type: " << mediaType - << " add frame with pts: " << frame->pts_; - media_data_[mediaType].frames_.push_back(std::move(frame)); - } else { - VLOG(1) << "media type: " << mediaType << " not found. Skip the frame."; - } -} - -void DecoderOutput::clear() { - media_data_.clear(); -} diff --git a/torchvision/csrc/cpu/video_reader/Interface.h b/torchvision/csrc/cpu/video_reader/Interface.h deleted file mode 100644 index e137008ce7b..00000000000 --- a/torchvision/csrc/cpu/video_reader/Interface.h +++ /dev/null @@ -1,127 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -extern "C" { - -#include -#include -void av_free(void* ptr); -} - -struct avDeleter { - void operator()(uint8_t* p) const { - av_free(p); - } -}; - -const AVPixelFormat defaultVideoPixelFormat = AV_PIX_FMT_RGB24; -const AVSampleFormat defaultAudioSampleFormat = AV_SAMPLE_FMT_FLT; - -using AvDataPtr = std::unique_ptr; - -enum MediaType : uint32_t { - TYPE_VIDEO = 1, - TYPE_AUDIO = 2, -}; - -struct EnumClassHash { - template - uint32_t operator()(T t) const { - return static_cast(t); - } -}; - -struct VideoFormat { - // fields are initialized for the auto detection - // caller can specify some/all of field values if specific output is desirable - - int width{0}; // width in pixels - int height{0}; // height in pixels - int minDimension{0}; // choose min dimension and rescale accordingly - // Output image pixel format. 
data type AVPixelFormat - AVPixelFormat format{defaultVideoPixelFormat}; // type AVPixelFormat - int64_t startPts{0}, endPts{0}; // Start and end presentation timestamp - int timeBaseNum{0}; - int timeBaseDen{1}; // numerator and denominator of time base - float fps{0.0}; - int64_t duration{0}; // duration of the stream, in stream time base -}; - -struct AudioFormat { - // fields are initialized for the auto detection - // caller can specify some/all of field values if specific output is desirable - - int samples{0}; // number samples per second (frequency) - int channels{0}; // number of channels - AVSampleFormat format{defaultAudioSampleFormat}; // type AVSampleFormat - int64_t startPts{0}, endPts{0}; // Start and end presentation timestamp - int timeBaseNum{0}; - int timeBaseDen{1}; // numerator and denominator of time base - int64_t duration{0}; // duration of the stream, in stream time base -}; - -union FormatUnion { - FormatUnion() {} - VideoFormat video; - AudioFormat audio; -}; - -struct MediaFormat { - MediaFormat() {} - - MediaFormat(const MediaFormat& mediaFormat) : type(mediaFormat.type) { - if (type == MediaType::TYPE_VIDEO) { - format.video = mediaFormat.format.video; - } else if (type == MediaType::TYPE_AUDIO) { - format.audio = mediaFormat.format.audio; - } - } - - MediaFormat(MediaType mediaType) : type(mediaType) { - if (mediaType == MediaType::TYPE_VIDEO) { - format.video = VideoFormat(); - } else if (mediaType == MediaType::TYPE_AUDIO) { - format.audio = AudioFormat(); - } - } - // media type - MediaType type; - // format data - FormatUnion format; -}; - -class DecodedFrame { - public: - explicit DecodedFrame() : frame_(nullptr), frameSize_(0), pts_(0) {} - explicit DecodedFrame(AvDataPtr frame, int frameSize, int64_t pts) - : frame_(std::move(frame)), frameSize_(frameSize), pts_(pts) {} - AvDataPtr frame_{nullptr}; - int frameSize_{0}; - int64_t pts_{0}; -}; - -struct MediaData { - MediaData() {} - MediaData(FormatUnion format) : format_(format) {} - FormatUnion format_; - std::vector> frames_; -}; - -class DecoderOutput { - public: - explicit DecoderOutput() {} - - ~DecoderOutput() {} - - void initMediaType(MediaType mediaType, FormatUnion format); - - void addMediaFrame(MediaType mediaType, std::unique_ptr frame); - - void clear(); - - std::unordered_map media_data_; -}; diff --git a/torchvision/csrc/cpu/video_reader/VideoReader.cpp b/torchvision/csrc/cpu/video_reader/VideoReader.cpp index dfe7f46bf39..7578927f1b5 100644 --- a/torchvision/csrc/cpu/video_reader/VideoReader.cpp +++ b/torchvision/csrc/cpu/video_reader/VideoReader.cpp @@ -3,11 +3,11 @@ #include #include #include -#include "FfmpegDecoder.h" -#include "FfmpegHeaders.h" -#include "util.h" +#include "memory_buffer.h" +#include "sync_decoder.h" using namespace std; +using namespace ffmpeg; // If we are in a Windows environment, we need to define // initialization functions for the _custom_ops extension @@ -27,121 +27,157 @@ PyMODINIT_FUNC PyInit_video_reader(void) { namespace video_reader { -class UnknownPixelFormatException : public exception { - const char* what() const throw() override { - return "Unknown pixel format"; - } -}; - -int getChannels(AVPixelFormat format) { - int numChannels = 0; - switch (format) { - case AV_PIX_FMT_BGR24: - case AV_PIX_FMT_RGB24: - numChannels = 3; - break; - default: - LOG(ERROR) << "Unknown format: " << format; - throw UnknownPixelFormatException(); - } - return numChannels; -} +const AVPixelFormat defaultVideoPixelFormat = AV_PIX_FMT_RGB24; +const AVSampleFormat 
defaultAudioSampleFormat = AV_SAMPLE_FMT_FLT; +const size_t decoderTimeoutMs = 600000; +// A small jitter is added to the end of the range to absorb conversion/rounding +// error: 100us is too small to select an extra frame, but large enough to +// compensate for the rounding error accumulated over the multiple conversions. +const size_t timeBaseJitterUs = 100; + +DecoderParameters getDecoderParams( + int64_t videoStartUs, + int64_t videoEndUs, + double seekFrameMarginUs, + int64_t getPtsOnly, + int64_t readVideoStream, + int videoWidth, + int videoHeight, + int videoMinDimension, + int64_t readAudioStream, + int audioSamples, + int audioChannels) { + DecoderParameters params; + params.headerOnly = getPtsOnly != 0; + params.seekAccuracy = seekFrameMarginUs; + params.startOffset = videoStartUs; + params.endOffset = videoEndUs; + params.timeoutMs = decoderTimeoutMs; + params.preventStaleness = false; -void fillVideoTensor( - std::vector>& frames, - torch::Tensor& videoFrame, - torch::Tensor& videoFramePts) { - int frameSize = 0; - if (videoFrame.numel() > 0) { - frameSize = videoFrame.numel() / frames.size(); + if (readVideoStream == 1) { + MediaFormat videoFormat(0); + videoFormat.type = TYPE_VIDEO; + videoFormat.format.video.format = defaultVideoPixelFormat; + videoFormat.format.video.width = videoWidth; + videoFormat.format.video.height = videoHeight; + videoFormat.format.video.minDimension = videoMinDimension; + params.formats.insert(videoFormat); } - int frameCount = 0; + if (readAudioStream == 1) { + MediaFormat audioFormat; + audioFormat.type = TYPE_AUDIO; + audioFormat.format.audio.format = defaultAudioSampleFormat; + audioFormat.format.audio.samples = audioSamples; + audioFormat.format.audio.channels = audioChannels; + params.formats.insert(audioFormat); + } - uint8_t* videoFrameData = - videoFrame.numel() > 0 ? videoFrame.data_ptr() : nullptr; - int64_t* videoFramePtsData = videoFramePts.data_ptr(); + return params; +} - for (size_t i = 0; i < frames.size(); ++i) { - const auto& frame = frames[i]; - if (videoFrameData) { - memcpy( - videoFrameData + (size_t)(frameCount++) * (size_t)frameSize, - frame->frame_.get(), - frameSize * sizeof(uint8_t)); +// Returns the number of bytes written. +template +size_t fillTensor( + std::vector& msgs, + torch::Tensor& frame, + torch::Tensor& framePts, + int64_t num, + int64_t den) { + if (msgs.empty()) { + return 0; + } + T* frameData = frame.numel() > 0 ? 
frame.data_ptr() : nullptr; + int64_t* framePtsData = framePts.data_ptr(); + CHECK_EQ(framePts.size(0), msgs.size()); + size_t avgElementsInFrame = frame.numel() / msgs.size(); + + size_t offset = 0; + for (size_t i = 0; i < msgs.size(); ++i) { + const auto& msg = msgs[i]; + // convert pts into original time_base + AVRational avr = {(int)num, (int)den}; + framePtsData[i] = av_rescale_q(msg.header.pts, AV_TIME_BASE_Q, avr); + VLOG(2) << "PTS type: " << sizeof(T) << ", us: " << msg.header.pts + << ", original: " << framePtsData[i]; + + if (frameData) { + auto sizeInBytes = msg.payload->length(); + memcpy(frameData + offset, msg.payload->data(), sizeInBytes); + if (sizeof(T) == sizeof(uint8_t)) { + // Video - move by allocated frame size + offset += avgElementsInFrame / sizeof(T); + } else { + // Audio - move by number of samples + offset += sizeInBytes / sizeof(T); + } } - videoFramePtsData[i] = frame->pts_; } + return offset * sizeof(T); } -void getVideoMeta( - DecoderOutput& decoderOutput, - int& numFrames, - int& height, - int& width, - int& numChannels) { - auto& videoFrames = decoderOutput.media_data_[TYPE_VIDEO].frames_; - numFrames = videoFrames.size(); - - FormatUnion& videoFormat = decoderOutput.media_data_[TYPE_VIDEO].format_; - height = videoFormat.video.height; - width = videoFormat.video.width; - numChannels = getChannels(videoFormat.video.format); +size_t fillVideoTensor( + std::vector& msgs, + torch::Tensor& videoFrame, + torch::Tensor& videoFramePts, + int64_t num, + int64_t den) { + return fillTensor(msgs, videoFrame, videoFramePts, num, den); } -void fillAudioTensor( - std::vector>& frames, +size_t fillAudioTensor( + std::vector& msgs, torch::Tensor& audioFrame, - torch::Tensor& audioFramePts) { - if (frames.size() == 0) { - return; - } - - float* audioFrameData = - audioFrame.numel() > 0 ? 
audioFrame.data_ptr() : nullptr; - CHECK_EQ(audioFramePts.size(0), frames.size()); - int64_t* audioFramePtsData = audioFramePts.data_ptr(); - - int bytesPerSample = av_get_bytes_per_sample(defaultAudioSampleFormat); - - int64_t frameDataOffset = 0; - for (size_t i = 0; i < frames.size(); ++i) { - audioFramePtsData[i] = frames[i]->pts_; - if (audioFrameData) { - memcpy( - audioFrameData + frameDataOffset, - frames[i]->frame_.get(), - frames[i]->frameSize_); - frameDataOffset += (frames[i]->frameSize_ / bytesPerSample); - } - } + torch::Tensor& audioFramePts, + int64_t num, + int64_t den) { + return fillTensor(msgs, audioFrame, audioFramePts, num, den); } -void getAudioMeta( - DecoderOutput& decoderOutput, - int64_t& numSamples, - int64_t& channels, - int64_t& numFrames) { - FormatUnion& audioFormat = decoderOutput.media_data_[TYPE_AUDIO].format_; - - channels = audioFormat.audio.channels; - CHECK_EQ(audioFormat.audio.format, AV_SAMPLE_FMT_FLT); - int bytesPerSample = av_get_bytes_per_sample( - static_cast(audioFormat.audio.format)); - - // auto& audioFrames = decoderOutput.media_frames_[TYPE_AUDIO]; - auto& audioFrames = decoderOutput.media_data_[TYPE_AUDIO].frames_; - numFrames = audioFrames.size(); - int64_t frameSizeTotal = 0; - for (auto const& decodedFrame : audioFrames) { - frameSizeTotal += static_cast(decodedFrame->frameSize_); +void offsetsToUs( + double& seekFrameMargin, + int64_t readVideoStream, + int64_t videoStartPts, + int64_t videoEndPts, + int64_t videoTimeBaseNum, + int64_t videoTimeBaseDen, + int64_t readAudioStream, + int64_t audioStartPts, + int64_t audioEndPts, + int64_t audioTimeBaseNum, + int64_t audioTimeBaseDen, + int64_t& videoStartUs, + int64_t& videoEndUs) { + seekFrameMargin *= AV_TIME_BASE; + videoStartUs = 0; + videoEndUs = -1; + + if (readVideoStream) { + AVRational vr = {(int)videoTimeBaseNum, (int)videoTimeBaseDen}; + if (videoStartPts > 0) { + videoStartUs = av_rescale_q(videoStartPts, vr, AV_TIME_BASE_Q); + } + if (videoEndPts > 0) { + // Add a small jitter (timeBaseJitterUs) to the end of the range: it is too + // small to select the next frame, but compensates for the rounding error + // introduced by the multiple conversions. + videoEndUs = + timeBaseJitterUs + av_rescale_q(videoEndPts, vr, AV_TIME_BASE_Q); + } + } else if (readAudioStream) { + AVRational ar = {(int)audioTimeBaseNum, (int)audioTimeBaseDen}; + if (audioStartPts > 0) { + videoStartUs = av_rescale_q(audioStartPts, ar, AV_TIME_BASE_Q); + } + if (audioEndPts > 0) { + // Add a small jitter (timeBaseJitterUs) to the end of the range: it is too + // small to select the next frame, but compensates for the rounding error + // introduced by the multiple conversions. 
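+ // Illustrative arithmetic (values assumed, not from this patch): with an audio + // time base of 1/44100 and audioEndPts = 441000, + // av_rescale_q(441000, {1, 44100}, AV_TIME_BASE_Q) yields exactly 10000000us; + // the extra 100us keeps a frame stamped at 10000000us inside the range even + // after lossy pts <-> us round trips. 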
+ videoEndUs = + timeBaseJitterUs + av_rescale_q(audioEndPts, ar, AV_TIME_BASE_Q); + } } - VLOG(2) << "numFrames: " << numFrames; - VLOG(2) << "frameSizeTotal: " << frameSizeTotal; - VLOG(2) << "channels: " << channels; - VLOG(2) << "bytesPerSample: " << bytesPerSample; - CHECK_EQ(frameSizeTotal % (channels * bytesPerSample), 0); - numSamples = frameSizeTotal / (channels * bytesPerSample); } torch::List readVideo( @@ -165,38 +201,83 @@ torch::List readVideo( int64_t audioEndPts, int64_t audioTimeBaseNum, int64_t audioTimeBaseDen) { - unique_ptr params = util::getDecoderParams( + int64_t videoStartUs, videoEndUs; + + offsetsToUs( seekFrameMargin, - getPtsOnly, readVideoStream, - width, - height, - minDimension, videoStartPts, videoEndPts, videoTimeBaseNum, videoTimeBaseDen, readAudioStream, - audioSamples, - audioChannels, audioStartPts, audioEndPts, audioTimeBaseNum, - audioTimeBaseDen); - - FfmpegDecoder decoder; - DecoderOutput decoderOutput; + audioTimeBaseDen, + videoStartUs, + videoEndUs); + + DecoderParameters params = getDecoderParams( + videoStartUs, // videoStartPts + videoEndUs, // videoEndPts + seekFrameMargin, // seekFrameMargin + getPtsOnly, // getPtsOnly + readVideoStream, // readVideoStream + width, // width + height, // height + minDimension, // minDimension + readAudioStream, // readAudioStream + audioSamples, // audioSamples + audioChannels // audioChannels + ); + SyncDecoder decoder; + std::vector audioMessages, videoMessages; + DecoderInCallback callback = nullptr; + std::string logMessage, logType; if (isReadFile) { - decoder.decodeFile(std::move(params), videoPath, decoderOutput); + params.uri = videoPath; + logType = "file"; + logMessage = videoPath; } else { - decoder.decodeMemory( - std::move(params), - input_video.data_ptr(), - input_video.size(0), - decoderOutput); + callback = MemoryBuffer::getCallback( + input_video.data_ptr(), input_video.size(0)); + logType = "memory"; + logMessage = std::to_string(input_video.size(0)); } + VLOG(1) << "Video decoding from " << logType << " [" << logMessage + << "] has started"; + + const auto now = std::chrono::system_clock::now(); + + bool succeeded; + if ((succeeded = decoder.init(params, std::move(callback)))) { + int res; + DecoderOutputMessage msg; + while (0 == (res = decoder.decode(&msg, decoderTimeoutMs))) { + if (msg.header.format.type == TYPE_VIDEO) { + videoMessages.push_back(std::move(msg)); + } + if (msg.header.format.type == TYPE_AUDIO) { + audioMessages.push_back(std::move(msg)); + } + msg.payload.reset(); + } + + const auto then = std::chrono::system_clock::now(); + VLOG(1) << "Video decoding from " << logType << " [" << logMessage + << "] has finished, " + << std::chrono::duration_cast(then - now) + .count() + << " us"; + } else { + LOG(ERROR) << "Decoder initialization has failed"; + } + + decoder.shutdown(); + // video section torch::Tensor videoFrame = torch::zeros({0}, torch::kByte); torch::Tensor videoFramePts = torch::zeros({0}, torch::kLong); @@ -204,37 +285,50 @@ torch::List readVideo( torch::Tensor videoFps = torch::zeros({0}, torch::kFloat); torch::Tensor videoDuration = torch::zeros({0}, torch::kLong); - if (readVideoStream == 1) { - auto it = decoderOutput.media_data_.find(TYPE_VIDEO); - if (it != decoderOutput.media_data_.end()) { - int numVideoFrames, outHeight, outWidth, numChannels; - getVideoMeta( - decoderOutput, numVideoFrames, outHeight, outWidth, numChannels); - + if (succeeded && readVideoStream == 1) { + if (!videoMessages.empty()) { + const auto& header = videoMessages[0].header; + 
const auto& media = header.format; + const auto& format = media.format.video; + int numVideoFrames = videoMessages.size(); + int outHeight = format.height; + int outWidth = format.width; + int numChannels = 3; // decoder guarantees the default AV_PIX_FMT_RGB24 + + size_t expectedWrittenBytes = 0; if (getPtsOnly == 0) { videoFrame = torch::zeros( {numVideoFrames, outHeight, outWidth, numChannels}, torch::kByte); + expectedWrittenBytes = + numVideoFrames * outHeight * outWidth * numChannels; } videoFramePts = torch::zeros({numVideoFrames}, torch::kLong); - fillVideoTensor( - decoderOutput.media_data_[TYPE_VIDEO].frames_, - videoFrame, - videoFramePts); + VLOG(2) << "video duration: " << media.duration << ", fps: " << header.fps + << ", num: " << media.num << ", den: " << media.den + << ", num frames: " << numVideoFrames; + + auto numberWrittenBytes = fillVideoTensor( + videoMessages, videoFrame, videoFramePts, media.num, media.den); + + CHECK_EQ(numberWrittenBytes, expectedWrittenBytes); videoTimeBase = torch::zeros({2}, torch::kInt); int* videoTimeBaseData = videoTimeBase.data_ptr(); - videoTimeBaseData[0] = it->second.format_.video.timeBaseNum; - videoTimeBaseData[1] = it->second.format_.video.timeBaseDen; + videoTimeBaseData[0] = media.num; + videoTimeBaseData[1] = media.den; videoFps = torch::zeros({1}, torch::kFloat); float* videoFpsData = videoFps.data_ptr(); - videoFpsData[0] = it->second.format_.video.fps; + videoFpsData[0] = header.fps; videoDuration = torch::zeros({1}, torch::kLong); int64_t* videoDurationData = videoDuration.data_ptr(); - videoDurationData[0] = it->second.format_.video.duration; + AVRational avr = {(int)media.num, (int)media.den}; + videoDurationData[0] = av_rescale_q(media.duration, AV_TIME_BASE_Q, avr); + VLOG(1) << "Video decoding from " << logType << " [" << logMessage + << "] filled video tensors"; } else { VLOG(1) << "Miss video stream"; } @@ -246,39 +340,58 @@ torch::List readVideo( torch::Tensor audioTimeBase = torch::zeros({0}, torch::kInt); torch::Tensor audioSampleRate = torch::zeros({0}, torch::kInt); torch::Tensor audioDuration = torch::zeros({0}, torch::kLong); - if (readAudioStream == 1) { - auto it = decoderOutput.media_data_.find(TYPE_AUDIO); - if (it != decoderOutput.media_data_.end()) { - VLOG(1) << "Find audio stream"; - int64_t numAudioSamples = 0, outAudioChannels = 0, numAudioFrames = 0; - getAudioMeta( - decoderOutput, numAudioSamples, outAudioChannels, numAudioFrames); - VLOG(2) << "numAudioSamples: " << numAudioSamples; - VLOG(2) << "outAudioChannels: " << outAudioChannels; - VLOG(2) << "numAudioFrames: " << numAudioFrames; - + if (succeeded && readAudioStream == 1) { + if (!audioMessages.empty()) { + const auto& header = audioMessages[0].header; + const auto& media = header.format; + const auto& format = media.format.audio; + + int64_t outAudioChannels = format.channels; + int bytesPerSample = + av_get_bytes_per_sample(static_cast(format.format)); + + int numAudioFrames = audioMessages.size(); + int64_t numAudioSamples = 0; if (getPtsOnly == 0) { + int64_t frameSizeTotal = 0; + for (auto const& audioMessage : audioMessages) { + frameSizeTotal += audioMessage.payload->length(); + } + + CHECK_EQ(frameSizeTotal % (outAudioChannels * bytesPerSample), 0); + numAudioSamples = frameSizeTotal / (outAudioChannels * bytesPerSample); + audioFrame = torch::zeros({numAudioSamples, outAudioChannels}, torch::kFloat); } audioFramePts = torch::zeros({numAudioFrames}, torch::kLong); - fillAudioTensor( - decoderOutput.media_data_[TYPE_AUDIO].frames_, - 
audioFrame, - audioFramePts); + + VLOG(2) << "audio duration: " << media.duration + << ", channels: " << format.channels + << ", sample rate: " << format.samples << ", num: " << media.num + << ", den: " << media.den; + + auto numberWrittenBytes = fillAudioTensor( + audioMessages, audioFrame, audioFramePts, media.num, media.den); + CHECK_EQ( + numberWrittenBytes, + numAudioSamples * outAudioChannels * sizeof(float)); audioTimeBase = torch::zeros({2}, torch::kInt); int* audioTimeBaseData = audioTimeBase.data_ptr(); - audioTimeBaseData[0] = it->second.format_.audio.timeBaseNum; - audioTimeBaseData[1] = it->second.format_.audio.timeBaseDen; + audioTimeBaseData[0] = media.num; + audioTimeBaseData[1] = media.den; audioSampleRate = torch::zeros({1}, torch::kInt); int* audioSampleRateData = audioSampleRate.data_ptr(); - audioSampleRateData[0] = it->second.format_.audio.samples; + audioSampleRateData[0] = format.samples; audioDuration = torch::zeros({1}, torch::kLong); int64_t* audioDurationData = audioDuration.data_ptr(); - audioDurationData[0] = it->second.format_.audio.duration; + AVRational avr = {(int)media.num, (int)media.den}; + audioDurationData[0] = av_rescale_q(media.duration, AV_TIME_BASE_Q, avr); + VLOG(1) << "Video decoding from " << logType << " [" << logMessage + << "] filled audio tensors"; } else { VLOG(1) << "Miss audio stream"; } @@ -296,6 +409,9 @@ torch::List readVideo( result.push_back(std::move(audioSampleRate)); result.push_back(std::move(audioDuration)); + VLOG(1) << "Video decoding from " << logType << " [" << logMessage + << "] about to return"; + return result; } @@ -388,59 +504,101 @@ torch::List probeVideo( bool isReadFile, const torch::Tensor& input_video, std::string videoPath) { - unique_ptr params = util::getDecoderParams( + DecoderParameters params = getDecoderParams( + 0, // videoStartUs + -1, // videoEndUs 0, // seekFrameMargin - 0, // getPtsOnly + 1, // getPtsOnly 1, // readVideoStream 0, // width 0, // height 0, // minDimension - 0, // videoStartPts - 0, // videoEndPts - 0, // videoTimeBaseNum - 1, // videoTimeBaseDen 1, // readAudioStream 0, // audioSamples - 0, // audioChannels - 0, // audioStartPts - 0, // audioEndPts - 0, // audioTimeBaseNum - 1 // audioTimeBaseDen + 0 // audioChannels ); - FfmpegDecoder decoder; - DecoderOutput decoderOutput; + SyncDecoder decoder; + DecoderOutputMessage audioMessage, videoMessage; + DecoderInCallback callback = nullptr; + std::string logMessage, logType; if (isReadFile) { - decoder.probeFile(std::move(params), videoPath, decoderOutput); + params.uri = videoPath; + logType = "file"; + logMessage = videoPath; + } else { + callback = MemoryBuffer::getCallback( + input_video.data_ptr(), input_video.size(0)); + logType = "memory"; + logMessage = std::to_string(input_video.size(0)); + } + + VLOG(1) << "Video probing from " << logType << " [" << logMessage + << "] has started"; + + const auto now = std::chrono::system_clock::now(); + + bool succeeded; + bool gotAudio = false, gotVideo = false; + if ((succeeded = decoder.init(params, std::move(callback)))) { + int res; + DecoderOutputMessage msg; + while (0 == (res = decoder.decode(&msg, decoderTimeoutMs)) && + (!gotAudio || !gotVideo)) { + if (msg.header.format.type == TYPE_VIDEO && !gotVideo) { + videoMessage = std::move(msg); + gotVideo = true; + } + if (msg.header.format.type == TYPE_AUDIO && !gotAudio) { + audioMessage = std::move(msg); + gotAudio = true; + } + msg.payload.reset(); + } + succeeded = (res == 0 || res == ENODATA); + + const auto then = 
std::chrono::system_clock::now(); + VLOG(1) << "Video probing from " << logType << " [" << logMessage + << "] has finished, " + << std::chrono::duration_cast(then - now) + .count() + << " us"; } else { - decoder.probeMemory( - std::move(params), - input_video.data_ptr(), - input_video.size(0), - decoderOutput); + LOG(ERROR) << "Decoder initialization has failed"; } + + decoder.shutdown(); + // video section torch::Tensor videoTimeBase = torch::zeros({0}, torch::kInt); torch::Tensor videoFps = torch::zeros({0}, torch::kFloat); torch::Tensor videoDuration = torch::zeros({0}, torch::kLong); - auto it = decoderOutput.media_data_.find(TYPE_VIDEO); - if (it != decoderOutput.media_data_.end()) { - VLOG(1) << "Find video stream"; + if (succeeded && gotVideo) { videoTimeBase = torch::zeros({2}, torch::kInt); int* videoTimeBaseData = videoTimeBase.data_ptr(); - videoTimeBaseData[0] = it->second.format_.video.timeBaseNum; - videoTimeBaseData[1] = it->second.format_.video.timeBaseDen; + const auto& header = videoMessage.header; + const auto& media = header.format; + + videoTimeBaseData[0] = media.num; + videoTimeBaseData[1] = media.den; videoFps = torch::zeros({1}, torch::kFloat); float* videoFpsData = videoFps.data_ptr(); - videoFpsData[0] = it->second.format_.video.fps; + videoFpsData[0] = header.fps; videoDuration = torch::zeros({1}, torch::kLong); int64_t* videoDurationData = videoDuration.data_ptr(); - videoDurationData[0] = it->second.format_.video.duration; + AVRational avr = {(int)media.num, (int)media.den}; + videoDurationData[0] = av_rescale_q(media.duration, AV_TIME_BASE_Q, avr); + + VLOG(2) << "Probe fps: " << header.fps << ", duration: " << media.duration + << ", num: " << media.num << ", den: " << media.den; + + VLOG(1) << "Video probing from " << logType << " [" << logMessage + << "] filled video tensors"; } else { - VLOG(1) << "Miss video stream"; + LOG(ERROR) << "Missing video stream"; } // audio section @@ -448,21 +606,31 @@ torch::List probeVideo( torch::Tensor audioSampleRate = torch::zeros({0}, torch::kInt); torch::Tensor audioDuration = torch::zeros({0}, torch::kLong); - it = decoderOutput.media_data_.find(TYPE_AUDIO); - if (it != decoderOutput.media_data_.end()) { - VLOG(1) << "Find audio stream"; + if (succeeded && gotAudio) { audioTimeBase = torch::zeros({2}, torch::kInt); int* audioTimeBaseData = audioTimeBase.data_ptr(); - audioTimeBaseData[0] = it->second.format_.audio.timeBaseNum; - audioTimeBaseData[1] = it->second.format_.audio.timeBaseDen; + const auto& header = audioMessage.header; + const auto& media = header.format; + const auto& format = media.format.audio; + + audioTimeBaseData[0] = media.num; + audioTimeBaseData[1] = media.den; audioSampleRate = torch::zeros({1}, torch::kInt); int* audioSampleRateData = audioSampleRate.data_ptr(); - audioSampleRateData[0] = it->second.format_.audio.samples; + audioSampleRateData[0] = format.samples; audioDuration = torch::zeros({1}, torch::kLong); int64_t* audioDurationData = audioDuration.data_ptr(); - audioDurationData[0] = it->second.format_.audio.duration; + AVRational avr = {(int)media.num, (int)media.den}; + audioDurationData[0] = av_rescale_q(media.duration, AV_TIME_BASE_Q, avr); + + VLOG(2) << "Probe sample rate: " << format.samples + << ", duration: " << media.duration << ", num: " << media.num + << ", den: " << media.den; + + VLOG(1) << "Video probing from " << logType << " [" << logMessage + << "] filled audio tensors"; } else { VLOG(1) << "Miss audio stream"; } @@ -475,6 +643,9 @@ 
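+ // Summary of the probe result assembled above: videoTimeBase is kInt[2] + // holding {num, den}, videoFps is kFloat[1], videoDuration is kLong[1] + // (rescaled to the stream time base); the audio counterparts follow below. 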
result.push_back(std::move(audioSampleRate)); result.push_back(std::move(audioDuration)); + VLOG(1) << "Video probing from " << logType << " [" << logMessage + << "] is about to return"; + return result; } diff --git a/torchvision/csrc/cpu/video_reader/VideoReader.h b/torchvision/csrc/cpu/video_reader/VideoReader.h index efc2e4709a6..923a3190977 100644 --- a/torchvision/csrc/cpu/video_reader/VideoReader.h +++ b/torchvision/csrc/cpu/video_reader/VideoReader.h @@ -1,99 +1,3 @@ #pragma once #include - -// Interface for Python - -/* - return: - videoFrame: tensor (N, H, W, C) kByte - videoFramePts: tensor (N) kLong - videoTimeBase: tensor (2) kInt - videoFps: tensor (1) kFloat - audioFrame: tensor (N, C) kFloat - audioFramePts: tensor (N) kLong - audioTimeBase: tensor (2) kInt - audioSampleRate: tensor (1) kInt -*/ -torch::List readVideoFromMemory( - // 1D tensor of data type uint8, storing the comparessed video data - torch::Tensor input_video, - // seeking frame in the video/audio stream is imprecise so seek to a - // timestamp earlier by a margin The unit of margin is second - double seekFrameMargin, - // If only pts is needed and video/audio frames are not needed, set it - // to 1 - int64_t getPtsOnly, - // bool variable. Set it to 1 if video stream should be read. Otherwise, set - // it to 0 - int64_t readVideoStream, - /* - Valid parameters values for rescaling video frames - ___________________________________________________ - | width | height | min_dimension | algorithm | - |_________________________________________________| - | 0 | 0 | 0 | original | - |_________________________________________________| - | 0 | 0 | >0 |scale to min dimension| - |_____|_____|____________________________________ | - | >0 | 0 | 0 | scale keeping W | - |_________________________________________________| - | 0 | >0 | 0 | scale keeping H | - |_________________________________________________| - | >0 | >0 | 0 | stretch/scale | - |_________________________________________________| - */ - int64_t width, - int64_t height, - int64_t minDimension, - // video frames with pts in [videoStartPts, videoEndPts] will be decoded - // For decoding all video frames, use [0, -1] - int64_t videoStartPts, - int64_t videoEndPts, - // numerator and denominator of time base of video stream. - // For decoding all video frames, supply dummy 0 (numerator) and 1 - // (denominator). For decoding localized video frames, need to supply - // them which will be checked during decoding - int64_t videoTimeBaseNum, - int64_t videoTimeBaseDen, - // bool variable. Set it to 1 if audio stream should be read. Otherwise, set - // it to 0 - int64_t readAudioStream, - // audio stream sampling rate. - // If not resampling audio waveform, supply 0 - // Otherwise, supply a positive integer. - int64_t audioSamples, - // audio stream channels - // Supply 0 to use the same number of channels as in the original audio - // stream - int64_t audioChannels, - // audio frames with pts in [audioStartPts, audioEndPts] will be decoded - // For decoding all audio frames, use [0, -1] - int64_t audioStartPts, - int64_t audioEndPts, - // numerator and denominator of time base of audio stream. - // For decoding all audio frames, supply dummy 0 (numerator) and 1 - // (denominator). 
For decoding localized audio frames, need to supply - // them which will be checked during decoding - int64_t audioTimeBaseNum, - int64_t audioTimeBaseDen); - -torch::List readVideoFromFile( - std::string videoPath, - double seekFrameMargin, - int64_t getPtsOnly, - int64_t readVideoStream, - int64_t width, - int64_t height, - int64_t minDimension, - int64_t videoStartPts, - int64_t videoEndPts, - int64_t videoTimeBaseNum, - int64_t videoTimeBaseDen, - int64_t readAudioStream, - int64_t audioSamples, - int64_t audioChannels, - int64_t audioStartPts, - int64_t audioEndPts, - int64_t audioTimeBaseNum, - int64_t audioTimeBaseDen); diff --git a/torchvision/csrc/cpu/video_reader/util.cpp b/torchvision/csrc/cpu/video_reader/util.cpp deleted file mode 100644 index ae3c3df0f0a..00000000000 --- a/torchvision/csrc/cpu/video_reader/util.cpp +++ /dev/null @@ -1,60 +0,0 @@ -#include "util.h" - -using namespace std; - -namespace util { - -unique_ptr getDecoderParams( - double seekFrameMargin, - int64_t getPtsOnly, - int64_t readVideoStream, - int videoWidth, - int videoHeight, - int videoMinDimension, - int64_t videoStartPts, - int64_t videoEndPts, - int videoTimeBaseNum, - int videoTimeBaseDen, - int64_t readAudioStream, - int audioSamples, - int audioChannels, - int64_t audioStartPts, - int64_t audioEndPts, - int audioTimeBaseNum, - int audioTimeBaseDen) { - unique_ptr params = make_unique(); - - if (readVideoStream == 1) { - params->formats.emplace( - MediaType::TYPE_VIDEO, MediaFormat(MediaType::TYPE_VIDEO)); - MediaFormat& videoFormat = params->formats[MediaType::TYPE_VIDEO]; - - videoFormat.format.video.width = videoWidth; - videoFormat.format.video.height = videoHeight; - videoFormat.format.video.minDimension = videoMinDimension; - videoFormat.format.video.startPts = videoStartPts; - videoFormat.format.video.endPts = videoEndPts; - videoFormat.format.video.timeBaseNum = videoTimeBaseNum; - videoFormat.format.video.timeBaseDen = videoTimeBaseDen; - } - - if (readAudioStream == 1) { - params->formats.emplace( - MediaType::TYPE_AUDIO, MediaFormat(MediaType::TYPE_AUDIO)); - MediaFormat& audioFormat = params->formats[MediaType::TYPE_AUDIO]; - - audioFormat.format.audio.samples = audioSamples; - audioFormat.format.audio.channels = audioChannels; - audioFormat.format.audio.startPts = audioStartPts; - audioFormat.format.audio.endPts = audioEndPts; - audioFormat.format.audio.timeBaseNum = audioTimeBaseNum; - audioFormat.format.audio.timeBaseDen = audioTimeBaseDen; - } - - params->seekFrameMargin = seekFrameMargin; - params->getPtsOnly = getPtsOnly; - - return params; -} - -} // namespace util diff --git a/torchvision/csrc/cpu/video_reader/util.h b/torchvision/csrc/cpu/video_reader/util.h deleted file mode 100644 index 6b5fd55388b..00000000000 --- a/torchvision/csrc/cpu/video_reader/util.h +++ /dev/null @@ -1,26 +0,0 @@ -#pragma once -#include -#include "FfmpegDecoder.h" - -namespace util { - -std::unique_ptr getDecoderParams( - double seekFrameMargin, - int64_t getPtsOnly, - int64_t readVideoStream, - int videoWidth, - int videoHeight, - int videoMinDimension, - int64_t videoStartPts, - int64_t videoEndPts, - int videoTimeBaseNum, - int videoTimeBaseDen, - int64_t readAudioStream, - int audioSamples, - int audioChannels, - int64_t audioStartPts, - int64_t audioEndPts, - int audioTimeBaseNum, - int audioTimeBaseDen); - -} // namespace util
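---

For reference, a minimal self-contained sketch of the ffmpeg::SyncDecoder init/decode/shutdown cycle that the readVideo()/probeVideo() implementations above follow. The helper name decodeAllVideoFrames and its hard-coded values are illustrative only, not part of this patch; error handling is reduced to early returns.

#include <string>
#include <vector>

#include "memory_buffer.h"
#include "sync_decoder.h"

using namespace ffmpeg;

// Decode every video frame of a file into DecoderOutputMessage objects,
// mirroring the loop in video_reader::readVideo above.
std::vector<DecoderOutputMessage> decodeAllVideoFrames(const std::string& path) {
  DecoderParameters params;
  params.uri = path;         // file input; use MemoryBuffer::getCallback for in-memory data
  params.timeoutMs = 600000; // same decoderTimeoutMs as above
  params.startOffset = 0;    // decode from the start of the stream...
  params.endOffset = -1;     // ...to the end

  MediaFormat videoFormat(0);
  videoFormat.type = TYPE_VIDEO;
  videoFormat.format.video.format = AV_PIX_FMT_RGB24;
  params.formats.insert(videoFormat);

  std::vector<DecoderOutputMessage> frames;
  SyncDecoder decoder;
  DecoderInCallback callback = nullptr; // nullptr: decoder reads from params.uri
  if (!decoder.init(params, std::move(callback))) {
    return frames; // initialization failed
  }
  DecoderOutputMessage msg;
  while (decoder.decode(&msg, params.timeoutMs) == 0) {
    if (msg.header.format.type == TYPE_VIDEO) {
      frames.push_back(std::move(msg));
    }
    msg.payload.reset();
  }
  decoder.shutdown();
  return frames;
}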