From 8c7006fba5ea133e4c564d48ff0aa602f22dc034 Mon Sep 17 00:00:00 2001 From: Yuri Putivsky Date: Tue, 28 Jan 2020 11:09:09 -0800 Subject: [PATCH] Integrated base decoder into VideoReader class and video_utils.py (#1766) Summary: Pull Request resolved: https://github.com/pytorch/vision/pull/1766 Replaced FfmpegDecoder (incompatible with VUE) by base decoder (compatible with VUE). Modified python utilities video_utils.py for internal simplification. Public interface got preserved. Differential Revision: D19415903 fbshipit-source-id: fcd4a7a6453c1468e578441a55ce424b72fe6778 --- setup.py | 32 +- .../csrc/cpu/decoder/audio_sampler.cpp | 2 - torchvision/csrc/cpu/decoder/audio_sampler.h | 2 - torchvision/csrc/cpu/decoder/audio_stream.cpp | 50 +- torchvision/csrc/cpu/decoder/audio_stream.h | 5 - torchvision/csrc/cpu/decoder/cc_stream.cpp | 2 - torchvision/csrc/cpu/decoder/cc_stream.h | 2 - torchvision/csrc/cpu/decoder/decoder.cpp | 146 ++++-- torchvision/csrc/cpu/decoder/decoder.h | 13 +- torchvision/csrc/cpu/decoder/defs.h | 48 +- .../csrc/cpu/decoder/memory_buffer.cpp | 78 +++ torchvision/csrc/cpu/decoder/memory_buffer.h | 25 + .../csrc/cpu/decoder/seekable_buffer.cpp | 108 ++-- .../csrc/cpu/decoder/seekable_buffer.h | 22 +- torchvision/csrc/cpu/decoder/stream.cpp | 114 +++- torchvision/csrc/cpu/decoder/stream.h | 15 +- .../csrc/cpu/decoder/subtitle_sampler.cpp | 2 - .../csrc/cpu/decoder/subtitle_sampler.h | 2 - .../csrc/cpu/decoder/subtitle_stream.cpp | 19 +- .../csrc/cpu/decoder/subtitle_stream.h | 6 +- torchvision/csrc/cpu/decoder/sync_decoder.cpp | 15 +- torchvision/csrc/cpu/decoder/sync_decoder.h | 2 - .../csrc/cpu/decoder/sync_decoder_test.cpp | 53 +- torchvision/csrc/cpu/decoder/time_keeper.cpp | 2 - torchvision/csrc/cpu/decoder/time_keeper.h | 2 - torchvision/csrc/cpu/decoder/util.cpp | 2 - torchvision/csrc/cpu/decoder/util.h | 2 - .../csrc/cpu/decoder/video_sampler.cpp | 2 - torchvision/csrc/cpu/decoder/video_sampler.h | 2 - 
torchvision/csrc/cpu/decoder/video_stream.cpp | 51 +- torchvision/csrc/cpu/decoder/video_stream.h | 6 +- .../cpu/video_reader/FfmpegAudioSampler.cpp | 118 ----- .../cpu/video_reader/FfmpegAudioSampler.h | 32 -- .../cpu/video_reader/FfmpegAudioStream.cpp | 103 ---- .../csrc/cpu/video_reader/FfmpegAudioStream.h | 54 -- .../csrc/cpu/video_reader/FfmpegDecoder.cpp | 412 --------------- .../csrc/cpu/video_reader/FfmpegDecoder.h | 127 ----- .../csrc/cpu/video_reader/FfmpegHeaders.h | 13 - .../csrc/cpu/video_reader/FfmpegSampler.h | 16 - .../csrc/cpu/video_reader/FfmpegStream.cpp | 188 ------- .../csrc/cpu/video_reader/FfmpegStream.h | 69 --- .../csrc/cpu/video_reader/FfmpegUtil.cpp | 111 ---- .../csrc/cpu/video_reader/FfmpegUtil.h | 27 - .../cpu/video_reader/FfmpegVideoSampler.cpp | 90 ---- .../cpu/video_reader/FfmpegVideoSampler.h | 32 -- .../cpu/video_reader/FfmpegVideoStream.cpp | 115 ----- .../csrc/cpu/video_reader/FfmpegVideoStream.h | 54 -- .../csrc/cpu/video_reader/Interface.cpp | 22 - torchvision/csrc/cpu/video_reader/Interface.h | 127 ----- .../csrc/cpu/video_reader/VideoReader.cpp | 487 +++++++++++------- torchvision/csrc/cpu/video_reader/util.cpp | 60 --- torchvision/csrc/cpu/video_reader/util.h | 26 - 52 files changed, 770 insertions(+), 2345 deletions(-) create mode 100644 torchvision/csrc/cpu/decoder/memory_buffer.cpp create mode 100644 torchvision/csrc/cpu/decoder/memory_buffer.h delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegAudioSampler.cpp delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegAudioSampler.h delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegAudioStream.cpp delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegAudioStream.h delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegDecoder.cpp delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegDecoder.h delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegHeaders.h delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegSampler.h delete mode 
100644 torchvision/csrc/cpu/video_reader/FfmpegStream.cpp delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegStream.h delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegUtil.cpp delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegUtil.h delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegVideoSampler.cpp delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegVideoSampler.h delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegVideoStream.cpp delete mode 100644 torchvision/csrc/cpu/video_reader/FfmpegVideoStream.h delete mode 100644 torchvision/csrc/cpu/video_reader/Interface.cpp delete mode 100644 torchvision/csrc/cpu/video_reader/Interface.h delete mode 100644 torchvision/csrc/cpu/video_reader/util.cpp delete mode 100644 torchvision/csrc/cpu/video_reader/util.h diff --git a/setup.py b/setup.py index 60b8a12c91b..f763bd2c42c 100644 --- a/setup.py +++ b/setup.py @@ -155,41 +155,21 @@ def get_extensions(): ffmpeg_root = os.path.dirname(ffmpeg_bin) ffmpeg_include_dir = os.path.join(ffmpeg_root, 'include') - # TorchVision video reader + # TorchVision base decoder + video reader video_reader_src_dir = os.path.join(this_dir, 'torchvision', 'csrc', 'cpu', 'video_reader') video_reader_src = glob.glob(os.path.join(video_reader_src_dir, "*.cpp")) - - ext_modules.append( - CppExtension( - 'torchvision.video_reader', - video_reader_src, - include_dirs=[ - video_reader_src_dir, - ffmpeg_include_dir, - extensions_dir, - ], - libraries=[ - 'avcodec', - 'avformat', - 'avutil', - 'swresample', - 'swscale', - ], - extra_compile_args=["-std=c++14"], - extra_link_args=["-std=c++14"], - ) - ) - - # TorchVision base decoder base_decoder_src_dir = os.path.join(this_dir, 'torchvision', 'csrc', 'cpu', 'decoder') base_decoder_src = glob.glob(os.path.join(base_decoder_src_dir, "[!sync_decoder_test]*.cpp")) + combined_src = video_reader_src + base_decoder_src + ext_modules.append( CppExtension( - 'torchvision.base_decoder', - base_decoder_src, + 
'torchvision.video_reader', + combined_src, include_dirs=[ base_decoder_src_dir, + video_reader_src_dir, ffmpeg_include_dir, extensions_dir, ], diff --git a/torchvision/csrc/cpu/decoder/audio_sampler.cpp b/torchvision/csrc/cpu/decoder/audio_sampler.cpp index c10fceb852d..514ac63f3e3 100644 --- a/torchvision/csrc/cpu/decoder/audio_sampler.cpp +++ b/torchvision/csrc/cpu/decoder/audio_sampler.cpp @@ -1,5 +1,3 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #include "audio_sampler.h" #include #include "util.h" diff --git a/torchvision/csrc/cpu/decoder/audio_sampler.h b/torchvision/csrc/cpu/decoder/audio_sampler.h index d68a21ea20e..c6a021d2084 100644 --- a/torchvision/csrc/cpu/decoder/audio_sampler.h +++ b/torchvision/csrc/cpu/decoder/audio_sampler.h @@ -1,5 +1,3 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #pragma once #include "defs.h" diff --git a/torchvision/csrc/cpu/decoder/audio_stream.cpp b/torchvision/csrc/cpu/decoder/audio_stream.cpp index 17ab9fceb7b..5fd08ccf35a 100644 --- a/torchvision/csrc/cpu/decoder/audio_stream.cpp +++ b/torchvision/csrc/cpu/decoder/audio_stream.cpp @@ -1,5 +1,3 @@ -// Copyright 2004-present Facebook. All Rights Reserved. 
- #include "audio_stream.h" #include #include @@ -8,11 +6,23 @@ namespace ffmpeg { namespace { +bool operator==(const AudioFormat& x, const AVFrame& y) { + return x.samples == y.sample_rate && x.channels == y.channels && + x.format == y.format; +} + bool operator==(const AudioFormat& x, const AVCodecContext& y) { return x.samples == y.sample_rate && x.channels == y.channels && x.format == y.sample_fmt; } +AudioFormat& toAudioFormat(AudioFormat& x, const AVFrame& y) { + x.samples = y.sample_rate; + x.channels = y.channels; + x.format = y.format; + return x; +} + AudioFormat& toAudioFormat(AudioFormat& x, const AVCodecContext& y) { x.samples = y.sample_rate; x.channels = y.channels; @@ -65,12 +75,15 @@ int AudioStream::initFormat() { int AudioStream::estimateBytes(bool flush) { ensureSampler(); - if (!(sampler_->getInputFormat().audio == *codecCtx_)) { + // check if input format gets changed + if (flush ? !(sampler_->getInputFormat().audio == *codecCtx_) + : !(sampler_->getInputFormat().audio == *frame_)) { // - reinit sampler SamplerParameters params; params.type = format_.type; params.out = format_.format; - toAudioFormat(params.in.audio, *codecCtx_); + flush ? toAudioFormat(params.in.audio, *codecCtx_) + : toAudioFormat(params.in.audio, *frame_); if (flush || !sampler_->init(params)) { return -1; } @@ -84,7 +97,7 @@ int AudioStream::estimateBytes(bool flush) { << ", channels: " << format_.format.audio.channels << ", format: " << format_.format.audio.format; } - return sampler_->getSamplesBytes(frame_); + return sampler_->getSamplesBytes(flush ? nullptr : frame_); } int AudioStream::copyFrameBytes(ByteStorage* out, bool flush) { @@ -92,31 +105,4 @@ int AudioStream::copyFrameBytes(ByteStorage* out, bool flush) { return sampler_->sample(flush ? 
nullptr : frame_, out); } -void AudioStream::setHeader(DecoderHeader* header) { - header->seqno = numGenerator_++; - - if (codecCtx_->time_base.num != 0) { - header->pts = av_rescale_q( - av_frame_get_best_effort_timestamp(frame_), - codecCtx_->time_base, - AV_TIME_BASE_Q); - } else { - // If the codec time_base is missing then we would've skipped the - // rescalePackage step to rescale to codec time_base, so here we can - // rescale straight from the stream time_base into AV_TIME_BASE_Q. - header->pts = av_rescale_q( - av_frame_get_best_effort_timestamp(frame_), - inputCtx_->streams[format_.stream]->time_base, - AV_TIME_BASE_Q); - } - - if (convertPtsToWallTime_) { - keeper_.adjust(header->pts); - } - - header->keyFrame = 1; - header->fps = std::numeric_limits::quiet_NaN(); - header->format = format_; -} - } // namespace ffmpeg diff --git a/torchvision/csrc/cpu/decoder/audio_stream.h b/torchvision/csrc/cpu/decoder/audio_stream.h index c7708a3356d..4d200114e4a 100644 --- a/torchvision/csrc/cpu/decoder/audio_stream.h +++ b/torchvision/csrc/cpu/decoder/audio_stream.h @@ -1,10 +1,7 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #pragma once #include "audio_sampler.h" #include "stream.h" -#include "time_keeper.h" namespace ffmpeg { @@ -25,13 +22,11 @@ class AudioStream : public Stream { int initFormat() override; int estimateBytes(bool flush) override; int copyFrameBytes(ByteStorage* out, bool flush) override; - void setHeader(DecoderHeader* header) override; void ensureSampler(); private: std::unique_ptr sampler_; - TimeKeeper keeper_; }; } // namespace ffmpeg diff --git a/torchvision/csrc/cpu/decoder/cc_stream.cpp b/torchvision/csrc/cpu/decoder/cc_stream.cpp index 47de485b100..7b443146289 100644 --- a/torchvision/csrc/cpu/decoder/cc_stream.cpp +++ b/torchvision/csrc/cpu/decoder/cc_stream.cpp @@ -1,5 +1,3 @@ -// Copyright 2004-present Facebook. All Rights Reserved. 
- #include "cc_stream.h" namespace ffmpeg { diff --git a/torchvision/csrc/cpu/decoder/cc_stream.h b/torchvision/csrc/cpu/decoder/cc_stream.h index 34506d3259f..d8c98f7be23 100644 --- a/torchvision/csrc/cpu/decoder/cc_stream.h +++ b/torchvision/csrc/cpu/decoder/cc_stream.h @@ -1,5 +1,3 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #pragma once #include "subtitle_stream.h" diff --git a/torchvision/csrc/cpu/decoder/decoder.cpp b/torchvision/csrc/cpu/decoder/decoder.cpp index d8f324863e4..692eb4494ff 100644 --- a/torchvision/csrc/cpu/decoder/decoder.cpp +++ b/torchvision/csrc/cpu/decoder/decoder.cpp @@ -1,5 +1,3 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #include "decoder.h" #include #include @@ -15,8 +13,6 @@ namespace ffmpeg { namespace { -constexpr ssize_t kMinSeekBufferSize = 1024; -constexpr ssize_t kMaxSeekBufferSize = 4 * 1024; constexpr size_t kIoBufferSize = 4 * 1024; constexpr size_t kLogBufferSize = 1024; @@ -205,7 +201,7 @@ void Decoder::initOnce() { av_lockmgr_register(&ffmpeg_lock); av_log_set_callback(Decoder::logFunction); av_log_set_level(AV_LOG_ERROR); - LOG(INFO) << "Registered ffmpeg libs"; + VLOG(1) << "Registered ffmpeg libs"; }); } @@ -248,23 +244,21 @@ bool Decoder::init(const DecoderParameters& params, DecoderInCallback&& in) { return false; } - bool canSeek = in(nullptr, 0, 0) == 0; + bool canSeek = in(nullptr, 0, 0, 0) == 0; - if (!seekableBuffer_.init( - std::forward(in), - kMinSeekBufferSize, - kMaxSeekBufferSize, - params_.timeoutMs)) { - LOG(ERROR) << "seekable buffer initialization failed"; - av_free(avioCtxBuffer); - avformat_close_input(&tmpCtx); - cleanUp(); - return false; - } + seekableBuffer_.init(std::forward(in)); if (params_.isImage) { + ImageType type = ImageType::UNKNOWN; + if (!seekableBuffer_.detect(params_.timeoutMs, &type)) { + LOG(ERROR) << "can't detect image type"; + av_free(avioCtxBuffer); + avformat_close_input(&tmpCtx); + cleanUp(); + return false; + } const char* fmtName = 
"image2"; - switch (seekableBuffer_.getImageType()) { + switch (type) { case ImageType::JPEG: fmtName = "jpeg_pipe"; break; @@ -299,19 +293,6 @@ bool Decoder::init(const DecoderParameters& params, DecoderInCallback&& in) { tmpCtx->pb = avioCtx_; } - interrupted_ = false; - // ffmpeg avformat_open_input call can hang if media source doesn't respond - // set a guard for handle such situations - std::promise p; - std::future f = p.get_future(); - std::thread guard([&f, this]() { - auto timeout = std::chrono::milliseconds(params_.timeoutMs); - if (std::future_status::timeout == f.wait_for(timeout)) { - LOG(ERROR) << "Cannot open stream within " << params_.timeoutMs << " ms"; - interrupted_ = true; - } - }); - tmpCtx->opaque = reinterpret_cast(this); tmpCtx->interrupt_callback.callback = Decoder::shutdownFunction; tmpCtx->interrupt_callback.opaque = reinterpret_cast(this); @@ -326,6 +307,23 @@ bool Decoder::init(const DecoderParameters& params, DecoderInCallback&& in) { av_dict_set_int(&options, "listen", 1, 0); } + interrupted_ = false; + // ffmpeg avformat_open_input call can hang if media source doesn't respond + // set a guard for handle such situations, if requested + std::promise p; + std::future f = p.get_future(); + std::unique_ptr guard; + if (params_.preventStaleness) { + guard = std::make_unique([&f, this]() { + auto timeout = std::chrono::milliseconds(params_.timeoutMs); + if (std::future_status::timeout == f.wait_for(timeout)) { + LOG(ERROR) << "Cannot open stream within " << params_.timeoutMs + << " ms"; + interrupted_ = true; + } + }); + } + int result = 0; if (fmt) { result = avformat_open_input(&tmpCtx, nullptr, fmt, &options); @@ -335,8 +333,11 @@ bool Decoder::init(const DecoderParameters& params, DecoderInCallback&& in) { } av_dict_free(&options); - p.set_value(true); - guard.join(); + if (guard) { + p.set_value(true); + guard->join(); + guard.reset(); + } inputCtx_ = tmpCtx; @@ -364,16 +365,15 @@ bool Decoder::init(const DecoderParameters& params, 
DecoderInCallback&& in) { onInit(); - if (params.startOffsetMs != 0) { - av_seek_frame( - inputCtx_, - -1, - params.startOffsetMs * AV_TIME_BASE / 1000, - AVSEEK_FLAG_FRAME | AVSEEK_FLAG_ANY); + if (params.startOffset != 0) { + auto offset = params.startOffset <= params.seekAccuracy + ? 0 + : params.startOffset - params.seekAccuracy; + + av_seek_frame(inputCtx_, -1, offset, AVSEEK_FLAG_BACKWARD); } - LOG(INFO) << "Decoder initialized, log level: " << params_.logLevel; - outOfRange_ = false; + VLOG(1) << "Decoder initialized, log level: " << params_.logLevel; return true; } @@ -418,6 +418,7 @@ bool Decoder::activateStreams() { return false; } streams_.emplace(i, std::move(stream)); + inRange_.set(i, true); } } @@ -458,8 +459,8 @@ void Decoder::cleanUp() { seekableBuffer_.shutdown(); } -int Decoder::getBytes(size_t workingTimeInMs) { - if (outOfRange_) { +int Decoder::getFrame(size_t workingTimeInMs) { + if (inRange_.none()) { return ENODATA; } // decode frames until cache is full and leave thread @@ -478,14 +479,16 @@ int Decoder::getBytes(size_t workingTimeInMs) { return std::chrono::steady_clock::now() <= end; }; - int result = ETIMEDOUT; + int result = 0; size_t decodingErrors = 0; - while (!interrupted_ && watcher()) { + bool decodedFrame = false; + while (!interrupted_ && inRange_.any() && !decodedFrame && watcher()) { result = av_read_frame(inputCtx_, &avPacket); if (result == AVERROR(EAGAIN)) { VLOG(4) << "Decoder is busy..."; + std::this_thread::yield(); result = 0; // reset error, EAGAIN is not an error at all - break; + continue; } else if (result == AVERROR_EOF) { flushStreams(); VLOG(1) << "End of stream"; @@ -499,20 +502,20 @@ int Decoder::getBytes(size_t workingTimeInMs) { // get stream auto stream = findByIndex(avPacket.stream_index); - if (stream == nullptr) { + if (stream == nullptr || !inRange_.test(stream->getIndex())) { av_packet_unref(&avPacket); continue; } - stream->rescalePackage(&avPacket); - AVPacket copyPacket = avPacket; size_t 
numConsecutiveNoBytes = 0; // it can be only partial decoding of the package bytes do { // decode package - if ((result = processPacket(stream, &copyPacket)) < 0) { + bool hasMsg = false; + if ((result = processPacket(stream, &copyPacket, &hasMsg)) < 0) { + LOG(ERROR) << "processPacket failed with code: " << result; break; } @@ -525,6 +528,8 @@ int Decoder::getBytes(size_t workingTimeInMs) { numConsecutiveNoBytes = 0; } + decodedFrame |= hasMsg; + copyPacket.size -= result; copyPacket.data += result; } while (copyPacket.size > 0); @@ -533,6 +538,7 @@ int Decoder::getBytes(size_t workingTimeInMs) { if (result < 0) { if (params_.maxPackageErrors != 0 && // check errors ++decodingErrors >= params_.maxPackageErrors) { // reached the limit + LOG(ERROR) << "Exceeding max amount of consecutive package errors"; break; } } else { @@ -546,7 +552,27 @@ int Decoder::getBytes(size_t workingTimeInMs) { av_packet_unref(&avPacket); - return result; + VLOG(2) << "Interrupted loop" + << ", interrupted_ " << interrupted_ << ", inRange_.any() " + << inRange_.any() << ", decodedFrame " << decodedFrame << ", result " + << result; + + // loop can be terminated, either by: + // 1. explicitly interrupted + // 2. terminated by workable timeout + // 3. unrecoverable error or ENODATA (end of stream) + // 4. decoded frames pts are out of the specified range + // 5. 
success decoded frame + if (interrupted_) { + return EINTR; + } + if (result != 0) { + return result; + } + if (inRange_.none()) { + return ENODATA; + } + return 0; } Stream* Decoder::findByIndex(int streamIndex) const { @@ -563,17 +589,21 @@ Stream* Decoder::findByType(const MediaFormat& format) const { return nullptr; } -int Decoder::processPacket(Stream* stream, AVPacket* packet) { +int Decoder::processPacket(Stream* stream, AVPacket* packet, bool* hasMsg) { // decode package - int gotFrame = 0; int result; DecoderOutputMessage msg; msg.payload = createByteStorage(0); + int gotFrame = 0; + *hasMsg = false; if ((result = stream->decodeFrame(packet, &gotFrame)) >= 0 && gotFrame && stream->getFrameBytes(&msg, params_.headerOnly) > 0) { // check end offset - if (params_.endOffsetMs <= 0 || - !(outOfRange_ = msg.header.pts > params_.endOffsetMs * 1000)) { + bool endInRange = + params_.endOffset <= 0 || msg.header.pts <= params_.endOffset; + inRange_.set(stream->getIndex(), endInRange); + if (endInRange && msg.header.pts >= params_.startOffset) { + *hasMsg = true; push(std::move(msg)); } } @@ -587,8 +617,10 @@ void Decoder::flushStreams() { while (msg.payload = createByteStorage(0), stream.second->flush(&msg, params_.headerOnly) > 0) { // check end offset - if (params_.endOffsetMs <= 0 || - !(outOfRange_ = msg.header.pts > params_.endOffsetMs * 1000)) { + bool endInRange = + params_.endOffset <= 0 || msg.header.pts <= params_.endOffset; + inRange_.set(stream.second->getIndex(), endInRange); + if (endInRange && msg.header.pts >= params_.startOffset) { push(std::move(msg)); } } diff --git a/torchvision/csrc/cpu/decoder/decoder.h b/torchvision/csrc/cpu/decoder/decoder.h index 971eec10aa4..90fc6d051b2 100644 --- a/torchvision/csrc/cpu/decoder/decoder.h +++ b/torchvision/csrc/cpu/decoder/decoder.h @@ -1,7 +1,7 @@ -// Copyright 2004-present Facebook. All Rights Reserved. 
- #pragma once +#include +#include #include "seekable_buffer.h" #include "stream.h" @@ -25,9 +25,10 @@ class Decoder : public MediaDecoder { protected: // function does actual work, derived class calls it in working thread - // periodically. On success method returns 0, ENOADATA on EOF and error on + // periodically. On success method returns 0, ENODATA on EOF, ETIMEDOUT if + // no frames got decoded in the specified timeout time, and error on // unrecoverable error. - int getBytes(size_t workingTimeInMs = 100); + int getFrame(size_t workingTimeInMs = 100); // Derived class must override method and consume the provided message virtual void push(DecoderOutputMessage&& buffer) = 0; @@ -59,7 +60,7 @@ class Decoder : public MediaDecoder { bool activateStreams(); Stream* findByIndex(int streamIndex) const; Stream* findByType(const MediaFormat& format) const; - int processPacket(Stream* stream, AVPacket* packet); + int processPacket(Stream* stream, AVPacket* packet, bool* hasMsg); void flushStreams(); void cleanUp(); @@ -72,6 +73,6 @@ class Decoder : public MediaDecoder { AVFormatContext* inputCtx_{nullptr}; AVIOContext* avioCtx_{nullptr}; std::unordered_map> streams_; - bool outOfRange_{false}; + std::bitset<64> inRange_; }; } // namespace ffmpeg diff --git a/torchvision/csrc/cpu/decoder/defs.h b/torchvision/csrc/cpu/decoder/defs.h index 62854668b90..f42e4652002 100644 --- a/torchvision/csrc/cpu/decoder/defs.h +++ b/torchvision/csrc/cpu/decoder/defs.h @@ -135,7 +135,7 @@ struct MediaFormat { ssize_t num{0}; // time base denominator ssize_t den{1}; - // duration of the stream, in stream time base, if available + // duration of the stream, in microseconds, if available ssize_t duration{-1}; }; @@ -151,24 +151,26 @@ struct DecoderParameters { size_t maxPackageErrors{0}; // max allowed consecutive times no bytes are processed. 0 means for infinite. 
size_t maxProcessNoBytes{0}; - // start offset - ssize_t startOffsetMs{0}; - // end offset - ssize_t endOffsetMs{-1}; + // start offset (us) + ssize_t startOffset{0}; + // end offset (us) + ssize_t endOffset{-1}; // logging id int64_t loggingUuid{0}; // adjust header pts to the epoch time bool convertPtsToWallTime{false}; // indicate if input stream is an encoded image bool isImage{false}; - // what media types should be processed, default none - std::set formats; // listen and wait for new rtmp stream bool listen{false}; // don't copy frame body, only header bool headerOnly{false}; - // seek tolerated accuracy - double seekAccuracySec{1.0}; + // interrupt init method on timeout + bool preventStaleness{true}; + // seek tolerated accuracy (us) + double seekAccuracy{1000000.0}; + // what media types should be processed, default none + std::set formats; }; struct DecoderHeader { @@ -219,27 +221,21 @@ struct DecoderOutputMessage { * Normally input/output parameter @out set to valid, not null buffer pointer, * which indicates "read" call, however there are "seek" modes as well. - * @out != nullptr, @size != 0, @timeoutMs != 0 => read from the current offset - * @size bytes => return number bytes read, 0 if no more bytes available, < 0 - * on error. - - * @out == nullptr, @size == 0, @timeoutMs == 0 => does provider support "seek" - * capability in a first place? return 0 on success, < 0 if "seek" mode is not - * supported. - - * @out == nullptr, @size > 0 => seek the absolute offset == @size, return - * 0 on success and < 0 on error. + * @out != nullptr => read from the current offset, @whence got ignored, + * @size bytes to read => return number bytes got read, 0 if no more bytes + * available, < 0 on error. - * @out == nullptr, @size < 0 => seek the end of the media, return 0 on success - * and < 0 on failure. Provider might support seek doesn't know the media size. + * @out == nullptr, @timeoutMs == 0 => does provider support "seek" + * capability in a first place? 
@size & @whence got ignored, return 0 on + * success, < 0 if "seek" mode is not supported. - * Additionally if @out is set to null AND @size is set to zero AND - * @timeoutMs is set to zero, caller requests the seek capability of the - * provider, i.e. returns 0 on success and error if provider is not supporting - * seek. + * @out == nullptr, @timeoutMs != 0 => normal seek call + * offset == @size, i.e. @whence = [SEEK_SET, SEEK_CUR, SEEK_END, AVSEEK_SIZE) + * return < 0 on error, 0 if @whence = [SEEK_SET, SEEK_CUR], and position if + * @whence = [SEEK_END, AVSEEK_SIZE]. */ using DecoderInCallback = - std::function; + std::function; using DecoderOutCallback = std::function; diff --git a/torchvision/csrc/cpu/decoder/memory_buffer.cpp b/torchvision/csrc/cpu/decoder/memory_buffer.cpp new file mode 100644 index 00000000000..ef1067003ff --- /dev/null +++ b/torchvision/csrc/cpu/decoder/memory_buffer.cpp @@ -0,0 +1,78 @@ +#include "memory_buffer.h" +#include + +extern "C" { +#include +} + +namespace ffmpeg { + +MemoryBuffer::MemoryBuffer(const uint8_t* buffer, size_t size) + : buffer_(buffer), len_(size) {} + +int MemoryBuffer::read(uint8_t* buf, int size) { + if (pos_ < len_) { + auto available = std::min(int(len_ - pos_), size); + memcpy(buf, buffer_ + pos_, available); + pos_ += available; + return available; + } + + return 0; +} + +int64_t MemoryBuffer::seek(int64_t offset, int whence) { + if (whence & AVSEEK_SIZE) { + return len_; + } + + // remove force flag + whence &= ~AVSEEK_FORCE; + + switch (whence) { + case SEEK_SET: + if (offset >= 0 && offset <= len_) { + pos_ = offset; + return 0; + } + break; + case SEEK_END: + if (len_ + offset >= 0 && len_ + offset <= len_) { + pos_ = len_ + offset; + return pos_; + } + break; + case SEEK_CUR: + if (pos_ + offset > 0 && pos_ + offset <= len_) { + pos_ += offset; + return 0; + } + break; + default: + LOG(ERROR) << "Unknown whence flag gets provided: " << whence; + } + return AVERROR(EINVAL); // we have no idea what the 
media size is +} + +/* static */ +DecoderInCallback MemoryBuffer::getCallback( + const uint8_t* buffer, + size_t size) { + MemoryBuffer object(buffer, size); + return + [object](uint8_t* out, int size, int whence, uint64_t timeoutMs) mutable + -> int { + if (out) { // see defs.h file + // read mode + return object.read(out, size); + } + // seek mode + if (!timeoutMs) { + // seek capabilty, yes - supported + return 0; + } + return object.seek(size, whence); + }; +} + +} // namespace ffmpeg diff --git a/torchvision/csrc/cpu/decoder/memory_buffer.h b/torchvision/csrc/cpu/decoder/memory_buffer.h new file mode 100644 index 00000000000..d1b2571477b --- /dev/null +++ b/torchvision/csrc/cpu/decoder/memory_buffer.h @@ -0,0 +1,25 @@ +#pragma once + +#include "defs.h" + +namespace ffmpeg { + +/** + * Class uses external memory buffer and implements a seekable interface. + */ +class MemoryBuffer { + public: + explicit MemoryBuffer(const uint8_t* buffer, size_t size); + int64_t seek(int64_t offset, int whence); + int read(uint8_t* buf, int size); + + // static constructor for decoder callback. + static DecoderInCallback getCallback(const uint8_t* buffer, size_t size); + + private: + const uint8_t* buffer_; // set at construction time + ssize_t pos_{0}; // current position + ssize_t len_{0}; // bytes in buffer +}; + +} // namespace ffmpeg diff --git a/torchvision/csrc/cpu/decoder/seekable_buffer.cpp b/torchvision/csrc/cpu/decoder/seekable_buffer.cpp index 8d159b789bf..a621988fef6 100644 --- a/torchvision/csrc/cpu/decoder/seekable_buffer.cpp +++ b/torchvision/csrc/cpu/decoder/seekable_buffer.cpp @@ -1,5 +1,3 @@ -// Copyright 2004-present Facebook. All Rights Reserved. 
- #include "seekable_buffer.h" #include #include @@ -10,17 +8,18 @@ extern "C" { namespace ffmpeg { -bool SeekableBuffer::init( - DecoderInCallback&& in, - ssize_t minSize, - ssize_t maxSize, - uint64_t timeoutMs) { +constexpr size_t kMinSeekBufferSize = 64; + +void SeekableBuffer::init(DecoderInCallback&& in) { inCallback_ = std::forward(in); - len_ = minSize; - buffer_.resize(len_); + isSeekable_ = inCallback_(nullptr, 0, 0, 0); +} + +bool SeekableBuffer::detect(uint64_t timeoutMs, ImageType* type) { + buffer_.resize(kMinSeekBufferSize); pos_ = 0; end_ = 0; - eof_ = 0; + eof_ = false; auto end = std::chrono::steady_clock::now() + std::chrono::milliseconds(timeoutMs); @@ -29,17 +28,14 @@ bool SeekableBuffer::init( }; bool hasTime = false; - while (!eof_ && end_ < maxSize && (hasTime = watcher())) { + while (!eof_ && end_ < buffer_.size() && (hasTime = watcher())) { // lets read all bytes into available buffer - auto res = inCallback_(buffer_.data() + end_, len_ - end_, timeoutMs); + auto res = + inCallback_(buffer_.data() + end_, buffer_.size() - end_, 0, timeoutMs); if (res > 0) { end_ += res; - if (end_ == len_) { - len_ = std::min(len_ * 4, maxSize); - buffer_.resize(len_); - } } else if (res == 0) { - eof_ = 1; + eof_ = true; } else { // error return false; @@ -52,37 +48,44 @@ bool SeekableBuffer::init( if (buffer_.size() > 2 && buffer_[0] == 0xFF && buffer_[1] == 0xD8 && buffer_[2] == 0xFF) { - imageType_ = ImageType::JPEG; + *type = ImageType::JPEG; } else if ( buffer_.size() > 3 && buffer_[1] == 'P' && buffer_[2] == 'N' && buffer_[3] == 'G') { - imageType_ = ImageType::PNG; + *type = ImageType::PNG; } else if ( buffer_.size() > 1 && ((buffer_[0] == 0x49 && buffer_[1] == 0x49) || (buffer_[0] == 0x4D && buffer_[1] == 0x4D))) { - imageType_ = ImageType::TIFF; + *type = ImageType::TIFF; + } else { + *type = ImageType::UNKNOWN; } + if (isSeekable_) { + pos_ = end_ = 0; + eof_ = false; + std::vector().swap(buffer_); + // reset callback + if (inCallback_(nullptr, 
0, SEEK_SET, timeoutMs)) { + return false; + } + } return true; } int SeekableBuffer::read(uint8_t* buf, int size, uint64_t timeoutMs) { - // 1. pos_ < end_ if (pos_ < end_) { + // read cached bytes for non-seekable callback auto available = std::min(int(end_ - pos_), size); memcpy(buf, buffer_.data() + pos_, available); pos_ += available; return available; } else if (!eof_) { - auto res = inCallback_(buf, size, timeoutMs); // read through + // normal read (see defs.h file), i.e. @buf != null + auto res = inCallback_(buf, size, 0, timeoutMs); // read through if (res > 0) { - pos_ += res; - if (pos_ > end_ && !buffer_.empty()) { - std::vector().swap(buffer_); - } - } else if (res == 0) { - eof_ = 1; + pos_ += res; // keep the track of the absolute possition. } return res; } else { @@ -91,54 +94,7 @@ int SeekableBuffer::read(uint8_t* buf, int size, uint64_t timeoutMs) { } int64_t SeekableBuffer::seek(int64_t offset, int whence, uint64_t timeoutMs) { - // remove force flag - whence &= ~AVSEEK_FORCE; - // get size request - int size = whence & AVSEEK_SIZE; - // remove size flag - whence &= ~AVSEEK_SIZE; - - if (size) { - return eof_ ? 
end_ : AVERROR(EINVAL); - } else { - switch (whence) { - case SEEK_SET: - if (offset < 0) { - return AVERROR(EINVAL); - } - if (offset <= end_) { - pos_ = offset; - return pos_; - } - if (!inCallback_(0, offset, timeoutMs)) { - pos_ = offset; - return 0; - } - break; - case SEEK_END: - if (eof_ && pos_ <= end_ && offset < 0 && end_ + offset >= 0) { - pos_ = end_ + offset; - return 0; - } - break; - case SEEK_CUR: - if (pos_ + offset < 0) { - return AVERROR(EINVAL); - } - if (pos_ + offset <= end_) { - pos_ += offset; - return 0; - } - if (!inCallback_(0, pos_ + offset, timeoutMs)) { - pos_ += offset; - return 0; - } - break; - default: - LOG(ERROR) << "Unknown whence flag gets provided: " << whence; - } - } - return AVERROR(EINVAL); // we have no idea what the media size is + return inCallback_(nullptr, offset, whence, timeoutMs); } void SeekableBuffer::shutdown() { diff --git a/torchvision/csrc/cpu/decoder/seekable_buffer.h b/torchvision/csrc/cpu/decoder/seekable_buffer.h index e8ba327e4ea..99761b173f8 100644 --- a/torchvision/csrc/cpu/decoder/seekable_buffer.h +++ b/torchvision/csrc/cpu/decoder/seekable_buffer.h @@ -1,5 +1,3 @@ -// Copyright 2004-present Facebook. All Rights Reserved. 
- #pragma once #include "defs.h" @@ -20,27 +18,21 @@ enum class ImageType { class SeekableBuffer { public: - // try to fill out buffer, returns true if EOF detected (seek will supported) - bool init( - DecoderInCallback&& in, - ssize_t minSize, - ssize_t maxSize, - uint64_t timeoutMs); + // sets callback + void init(DecoderInCallback&& in); + // try to detect image type + bool detect(uint64_t timeoutMs, ImageType* type); int read(uint8_t* buf, int size, uint64_t timeoutMs); int64_t seek(int64_t offset, int whence, uint64_t timeoutMs); void shutdown(); - ImageType getImageType() const { - return imageType_; - } private: DecoderInCallback inCallback_; std::vector buffer_; // resized at init time - ssize_t len_{0}; // current buffer size ssize_t pos_{0}; // current position (SEEK_CUR iff pos_ < end_) - ssize_t end_{0}; // bytes in buffer [0, buffer_.size()] - ssize_t eof_{0}; // indicates the EOF - ImageType imageType_{ImageType::UNKNOWN}; + ssize_t end_{0}; // current buffer size + bool eof_{0}; // indicates the EOF + bool isSeekable_{false}; // is callback seekable }; } // namespace ffmpeg diff --git a/torchvision/csrc/cpu/decoder/stream.cpp b/torchvision/csrc/cpu/decoder/stream.cpp index 767136657b6..a3238280b58 100644 --- a/torchvision/csrc/cpu/decoder/stream.cpp +++ b/torchvision/csrc/cpu/decoder/stream.cpp @@ -1,15 +1,9 @@ -// Copyright 2004-present Facebook. All Rights Reserved. 
- #include "stream.h" #include #include "util.h" namespace ffmpeg { -namespace { -const size_t kDecoderHeaderSize = sizeof(DecoderHeader); -} - Stream::Stream( AVFormatContext* inputCtx, MediaFormat format, @@ -62,21 +56,28 @@ int Stream::openCodec() { frame_ = av_frame_alloc(); + // always convert to us format_.num = inputCtx_->streams[format_.stream]->time_base.num; format_.den = inputCtx_->streams[format_.stream]->time_base.den; - format_.duration = inputCtx_->streams[format_.stream]->duration; - return initFormat(); -} - -// rescale package -void Stream::rescalePackage(AVPacket* packet) { - if (codecCtx_->time_base.num != 0) { - av_packet_rescale_ts( - packet, - inputCtx_->streams[format_.stream]->time_base, - codecCtx_->time_base); + switch (format_.type) { + case TYPE_VIDEO: + fps_ = av_q2d(av_guess_frame_rate( + inputCtx_, inputCtx_->streams[format_.stream], nullptr)); + break; + case TYPE_AUDIO: + fps_ = codecCtx_->sample_rate; + break; + default: + fps_ = 30.0; } + + format_.duration = av_rescale_q( + inputCtx_->streams[format_.stream]->duration, + inputCtx_->streams[format_.stream]->time_base, + AV_TIME_BASE_Q); + + return initFormat(); } int Stream::analyzePacket(const AVPacket* packet, int* gotFramePtr) { @@ -137,29 +138,90 @@ int Stream::flush(DecoderOutputMessage* out, bool headerOnly) { } else if ((result = fillBuffer(out, true, headerOnly)) > 0) { return result; } + avcodec_flush_buffers(codecCtx_); return result; } int Stream::fillBuffer(DecoderOutputMessage* out, bool flush, bool headerOnly) { int result = -1; if (!codecCtx_) { - LOG(INFO) << "Codec is not initialized"; + LOG(ERROR) << "Codec is not initialized"; return result; } - // assign message - setHeader(&out->header); + // estimate the required storage + int bytes; + if ((bytes = estimateBytes(flush)) < 0) { + return bytes; + } + if (flush) { + // grab all audio bytes (video & subtitle sampler don't have cached bytes) + int processed = 0; + do { + bytes += processed; + 
out->payload->ensure(bytes); + if ((processed = copyFrameBytes(out->payload.get(), flush)) < 0) { + return processed; + } + } while (processed); + + // set header + if (out->payload->length()) { + setHeader(&out->header, flush); + } + bytes = out->payload->length(); + } else { + out->payload->ensure(bytes); + if ((bytes = copyFrameBytes(out->payload.get(), flush)) > 0) { + setHeader(&out->header, flush); + } + } if (headerOnly) { - return sizeof(out->header); + out->payload.reset(); } + return bytes; +} - // init sampler, if any and return required bytes - if ((result = estimateBytes(flush)) < 0) { - return result; +void Stream::setHeader(DecoderHeader* header, bool flush) { + header->seqno = numGenerator_++; + + setFramePts(header, flush); + + if (convertPtsToWallTime_) { + keeper_.adjust(header->pts); + } + + header->format = format_; + header->keyFrame = 0; + header->fps = std::numeric_limits::quiet_NaN(); +} + +void Stream::setFramePts(DecoderHeader* header, bool flush) { + if (flush) { + header->pts = nextPts_; // already in us + } else { + header->pts = av_frame_get_best_effort_timestamp(frame_); + if (header->pts == AV_NOPTS_VALUE) { + header->pts = nextPts_; + } else { + header->pts = av_rescale_q( + header->pts, + inputCtx_->streams[format_.stream]->time_base, + AV_TIME_BASE_Q); + } + + switch (format_.type) { + case TYPE_AUDIO: + nextPts_ = header->pts + frame_->nb_samples * AV_TIME_BASE / fps_; + break; + case TYPE_VIDEO: + nextPts_ = header->pts + AV_TIME_BASE / fps_; + break; + default: + nextPts_ = header->pts; + } } - out->payload->ensure(result); - return copyFrameBytes(out->payload.get(), flush); } } // namespace ffmpeg diff --git a/torchvision/csrc/cpu/decoder/stream.h b/torchvision/csrc/cpu/decoder/stream.h index fd83b90428c..be16cabe519 100644 --- a/torchvision/csrc/cpu/decoder/stream.h +++ b/torchvision/csrc/cpu/decoder/stream.h @@ -1,9 +1,8 @@ -// Copyright 2004-present Facebook. All Rights Reserved. 
- #pragma once #include #include "defs.h" +#include "time_keeper.h" extern "C" { #include @@ -37,8 +36,6 @@ class Stream { int getFrameBytes(DecoderOutputMessage* out, bool headerOnly); // returns number decoded/sampled bytes int flush(DecoderOutputMessage* out, bool headerOnly); - // rescale package - void rescalePackage(AVPacket* packet); // return media format MediaFormat getMediaFormat() const { return format_; @@ -50,10 +47,12 @@ class Stream { virtual int analyzePacket(const AVPacket* packet, int* gotFramePtr); // returns number decoded/sampled bytes, or negative error virtual int copyFrameBytes(ByteStorage* out, bool flush) = 0; - // initialize codec, returns output buffer size, or negative error + // estimates bytes in frame, returns output buffer size, or negative error virtual int estimateBytes(bool flush) = 0; // sets output format - virtual void setHeader(DecoderHeader* header) = 0; + virtual void setHeader(DecoderHeader* header, bool flush); + // set frame pts + virtual void setFramePts(DecoderHeader* header, bool flush); // finds codec virtual AVCodec* findCodec(AVCodecContext* ctx); @@ -69,6 +68,10 @@ class Stream { AVFrame* frame_{nullptr}; std::atomic numGenerator_{0}; + TimeKeeper keeper_; + // estimated next frame pts for flushing the last frame + int64_t nextPts_{0}; + double fps_{30.}; }; } // namespace ffmpeg diff --git a/torchvision/csrc/cpu/decoder/subtitle_sampler.cpp b/torchvision/csrc/cpu/decoder/subtitle_sampler.cpp index 02859c19187..b89ef8f1b86 100644 --- a/torchvision/csrc/cpu/decoder/subtitle_sampler.cpp +++ b/torchvision/csrc/cpu/decoder/subtitle_sampler.cpp @@ -1,5 +1,3 @@ -// Copyright 2004-present Facebook. All Rights Reserved. 
- #include "subtitle_sampler.h" #include "util.h" diff --git a/torchvision/csrc/cpu/decoder/subtitle_sampler.h b/torchvision/csrc/cpu/decoder/subtitle_sampler.h index 4846fe4d7c5..298e48d591f 100644 --- a/torchvision/csrc/cpu/decoder/subtitle_sampler.h +++ b/torchvision/csrc/cpu/decoder/subtitle_sampler.h @@ -1,5 +1,3 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #pragma once #include "defs.h" diff --git a/torchvision/csrc/cpu/decoder/subtitle_stream.cpp b/torchvision/csrc/cpu/decoder/subtitle_stream.cpp index b699a0507cf..c8a1d4000a2 100644 --- a/torchvision/csrc/cpu/decoder/subtitle_stream.cpp +++ b/torchvision/csrc/cpu/decoder/subtitle_stream.cpp @@ -1,5 +1,3 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #include "subtitle_stream.h" #include #include @@ -73,13 +71,13 @@ int SubtitleStream::analyzePacket(const AVPacket* packet, int* gotFramePtr) { return result; } -int SubtitleStream::estimateBytes(bool flush) { +int SubtitleStream::estimateBytes(bool) { if (!(sampler_.getInputFormat().subtitle == *codecCtx_)) { // - reinit sampler SamplerParameters params; params.type = MediaType::TYPE_SUBTITLE; toSubtitleFormat(params.in.subtitle, *codecCtx_); - if (flush || !sampler_.init(params)) { + if (!sampler_.init(params)) { return -1; } @@ -92,17 +90,8 @@ int SubtitleStream::copyFrameBytes(ByteStorage* out, bool flush) { return sampler_.sample(flush ? 
nullptr : &sub_, out); } -void SubtitleStream::setHeader(DecoderHeader* header) { - header->seqno = numGenerator_++; - +void SubtitleStream::setFramePts(DecoderHeader* header, bool) { header->pts = sub_.pts; // already in us - - if (convertPtsToWallTime_) { - keeper_.adjust(header->pts); - } - - header->keyFrame = 0; - header->fps = std::numeric_limits::quiet_NaN(); - header->format = format_; } + } // namespace ffmpeg diff --git a/torchvision/csrc/cpu/decoder/subtitle_stream.h b/torchvision/csrc/cpu/decoder/subtitle_stream.h index 8669f15e0ce..1fae7297c41 100644 --- a/torchvision/csrc/cpu/decoder/subtitle_stream.h +++ b/torchvision/csrc/cpu/decoder/subtitle_stream.h @@ -1,10 +1,7 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #pragma once #include "stream.h" #include "subtitle_sampler.h" -#include "time_keeper.h" namespace ffmpeg { @@ -25,7 +22,7 @@ class SubtitleStream : public Stream { ~SubtitleStream() override; protected: - void setHeader(DecoderHeader* header) override; + void setFramePts(DecoderHeader* header, bool flush) override; private: int initFormat() override; @@ -36,7 +33,6 @@ class SubtitleStream : public Stream { private: SubtitleSampler sampler_; - TimeKeeper keeper_; AVSubtitleKeeper sub_; }; diff --git a/torchvision/csrc/cpu/decoder/sync_decoder.cpp b/torchvision/csrc/cpu/decoder/sync_decoder.cpp index 6387837218e..2bd81209d92 100644 --- a/torchvision/csrc/cpu/decoder/sync_decoder.cpp +++ b/torchvision/csrc/cpu/decoder/sync_decoder.cpp @@ -1,5 +1,3 @@ -// Copyright 2004-present Facebook. All Rights Reserved. 
- #include "sync_decoder.h" #include @@ -41,7 +39,7 @@ size_t SyncDecoder::VectorByteStorage::length() const { size_t SyncDecoder::VectorByteStorage::tail() const { auto size = buffer_.size(); - CHECK_LE(offset_ + length_, buffer_.size()); + CHECK_LE(offset_ + length_, size); return size - offset_ - length_; } @@ -66,16 +64,21 @@ int SyncDecoder::decode(DecoderOutputMessage* out, uint64_t timeoutMs) { } if (queue_.empty()) { - int result = getBytes(timeoutMs); + int result = getFrame(timeoutMs); + // assign EOF eof_ = result == ENODATA; - + // check unrecoverable error, any error but ENODATA if (result && result != ENODATA) { return result; } // still empty if (queue_.empty()) { - return ETIMEDOUT; + if (eof_) { + return ENODATA; + } else { + return ETIMEDOUT; + } } } diff --git a/torchvision/csrc/cpu/decoder/sync_decoder.h b/torchvision/csrc/cpu/decoder/sync_decoder.h index 76c347fe707..ae5168734dd 100644 --- a/torchvision/csrc/cpu/decoder/sync_decoder.h +++ b/torchvision/csrc/cpu/decoder/sync_decoder.h @@ -1,5 +1,3 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #pragma once #include diff --git a/torchvision/csrc/cpu/decoder/sync_decoder_test.cpp b/torchvision/csrc/cpu/decoder/sync_decoder_test.cpp index ee0fe3fcf3c..e6528bba216 100644 --- a/torchvision/csrc/cpu/decoder/sync_decoder_test.cpp +++ b/torchvision/csrc/cpu/decoder/sync_decoder_test.cpp @@ -1,7 +1,6 @@ -// Copyright 2004-present Facebook. All Rights Reserved. 
- #include #include +#include "memory_buffer.h" #include "sync_decoder.h" using namespace ffmpeg; @@ -10,7 +9,8 @@ TEST(SyncDecoder, Test) { SyncDecoder decoder; DecoderParameters params; params.timeoutMs = 10000; - params.startOffsetMs = 1000; + params.startOffset = 1000000; + params.seekAccuracy = 100000; params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')}; params.uri = "pytorch/vision/test/assets/videos/R6llTwEh07w.mp4"; CHECK(decoder.init(params, nullptr)); @@ -20,3 +20,50 @@ TEST(SyncDecoder, Test) { } decoder.shutdown(); } + +TEST(SyncDecoder, TestHeadersOnly) { + SyncDecoder decoder; + DecoderParameters params; + params.timeoutMs = 10000; + params.startOffset = 1000000; + params.seekAccuracy = 100000; + params.headerOnly = true; + params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')}; + params.uri = + "pytorch/vision/test/assets/videos/RATRACE_wave_f_nm_np1_fr_goo_37.avi"; + CHECK(decoder.init(params, nullptr)); + DecoderOutputMessage out; + while (0 == decoder.decode(&out, 100)) { + LOG(INFO) << "Decoded frame, type: " << out.header.format.type + << ", timestamp(us): " << out.header.pts; + } + decoder.shutdown(); +} + +TEST(SyncDecoder, TestMemoryBuffer) { + SyncDecoder decoder; + DecoderParameters params; + params.timeoutMs = 10000; + params.startOffset = 1000000; + params.endOffset = 9000000; + params.seekAccuracy = 10000; + params.formats = {MediaFormat(), MediaFormat(0), MediaFormat('0')}; + + FILE* f = fopen("pytorch/vision/test/assets/videos/R6llTwEh07w.mp4", "rb"); + CHECK(f != nullptr); + fseek(f, 0, SEEK_END); + std::vector buffer(ftell(f)); + rewind(f); + CHECK_EQ(buffer.size(), fread(buffer.data(), 1, buffer.size(), f)); + fclose(f); + CHECK(decoder.init( + params, MemoryBuffer::getCallback(buffer.data(), buffer.size()))); + DecoderOutputMessage out; + while (0 == decoder.decode(&out, 100)) { + LOG(INFO) << "Decoded frame, timestamp(us): " << out.header.pts + << ", num: " << out.header.format.num + << ", den: " << 
out.header.format.den + << ", duration(us): " << out.header.format.duration; + } + decoder.shutdown(); +} diff --git a/torchvision/csrc/cpu/decoder/time_keeper.cpp b/torchvision/csrc/cpu/decoder/time_keeper.cpp index a0da56a1f64..c1ca6c9a1ef 100644 --- a/torchvision/csrc/cpu/decoder/time_keeper.cpp +++ b/torchvision/csrc/cpu/decoder/time_keeper.cpp @@ -1,5 +1,3 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #include "time_keeper.h" extern "C" { diff --git a/torchvision/csrc/cpu/decoder/time_keeper.h b/torchvision/csrc/cpu/decoder/time_keeper.h index c9d06025b2c..19cc027409b 100644 --- a/torchvision/csrc/cpu/decoder/time_keeper.h +++ b/torchvision/csrc/cpu/decoder/time_keeper.h @@ -1,5 +1,3 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #pragma once #include diff --git a/torchvision/csrc/cpu/decoder/util.cpp b/torchvision/csrc/cpu/decoder/util.cpp index 6ae888838ea..ba19cf582b0 100644 --- a/torchvision/csrc/cpu/decoder/util.cpp +++ b/torchvision/csrc/cpu/decoder/util.cpp @@ -1,5 +1,3 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #include "util.h" #include diff --git a/torchvision/csrc/cpu/decoder/util.h b/torchvision/csrc/cpu/decoder/util.h index 6a985d78559..cc64d8944e4 100644 --- a/torchvision/csrc/cpu/decoder/util.h +++ b/torchvision/csrc/cpu/decoder/util.h @@ -1,5 +1,3 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #pragma once #include "defs.h" diff --git a/torchvision/csrc/cpu/decoder/video_sampler.cpp b/torchvision/csrc/cpu/decoder/video_sampler.cpp index 1a91c82a371..4b7d078ebd7 100644 --- a/torchvision/csrc/cpu/decoder/video_sampler.cpp +++ b/torchvision/csrc/cpu/decoder/video_sampler.cpp @@ -1,5 +1,3 @@ -// Copyright 2004-present Facebook. All Rights Reserved. 
- #include "video_sampler.h" #include #include "util.h" diff --git a/torchvision/csrc/cpu/decoder/video_sampler.h b/torchvision/csrc/cpu/decoder/video_sampler.h index 73997c213e1..85161307257 100644 --- a/torchvision/csrc/cpu/decoder/video_sampler.h +++ b/torchvision/csrc/cpu/decoder/video_sampler.h @@ -1,5 +1,3 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #pragma once #include "defs.h" diff --git a/torchvision/csrc/cpu/decoder/video_stream.cpp b/torchvision/csrc/cpu/decoder/video_stream.cpp index 9c6b77d0bfc..f1faac2fbdc 100644 --- a/torchvision/csrc/cpu/decoder/video_stream.cpp +++ b/torchvision/csrc/cpu/decoder/video_stream.cpp @@ -1,5 +1,3 @@ -// Copyright 2004-present Facebook. All Rights Reserved. - #include "video_stream.h" #include #include "util.h" @@ -11,12 +9,23 @@ bool operator==(const VideoFormat& x, const AVFrame& y) { return x.width == y.width && x.height == y.height && x.format == y.format; } +bool operator==(const VideoFormat& x, const AVCodecContext& y) { + return x.width == y.width && x.height == y.height && x.format == y.pix_fmt; +} + VideoFormat& toVideoFormat(VideoFormat& x, const AVFrame& y) { x.width = y.width; x.height = y.height; x.format = y.format; return x; } + +VideoFormat& toVideoFormat(VideoFormat& x, const AVCodecContext& y) { + x.width = y.width; + x.height = y.height; + x.format = y.pix_fmt; + return x; +} } // namespace VideoStream::VideoStream( @@ -79,12 +88,14 @@ int VideoStream::initFormat() { int VideoStream::estimateBytes(bool flush) { ensureSampler(); // check if input format gets changed - if (!flush && !(sampler_->getInputFormat().video == *frame_)) { + if (flush ? !(sampler_->getInputFormat().video == *codecCtx_) + : !(sampler_->getInputFormat().video == *frame_)) { // - reinit sampler SamplerParameters params; params.type = format_.type; params.out = format_.format; - toVideoFormat(params.in.video, *frame_); + flush ? 
toVideoFormat(params.in.video, *codecCtx_) + : toVideoFormat(params.in.video, *frame_); if (!sampler_->init(params)) { return -1; } @@ -108,36 +119,12 @@ int VideoStream::copyFrameBytes(ByteStorage* out, bool flush) { return sampler_->sample(flush ? nullptr : frame_, out); } -void VideoStream::setHeader(DecoderHeader* header) { - header->seqno = numGenerator_++; - - if (codecCtx_->time_base.num != 0) { - header->pts = av_rescale_q( - av_frame_get_best_effort_timestamp(frame_), - codecCtx_->time_base, - AV_TIME_BASE_Q); - } else { - // If the codec time_base is missing then we would've skipped the - // rescalePackage step to rescale to codec time_base, so here we can - // rescale straight from the stream time_base into AV_TIME_BASE_Q. - header->pts = av_rescale_q( - av_frame_get_best_effort_timestamp(frame_), - inputCtx_->streams[format_.stream]->time_base, - AV_TIME_BASE_Q); - } - - if (convertPtsToWallTime_) { - keeper_.adjust(header->pts); - } +void VideoStream::setHeader(DecoderHeader* header, bool flush) { + Stream::setHeader(header, flush); header->keyFrame = frame_->key_frame; - auto fpsRational = inputCtx_->streams[format_.stream]->avg_frame_rate; - if (fpsRational.den) { - header->fps = av_q2d(fpsRational); - } else { - header->fps = std::numeric_limits::quiet_NaN(); - } - header->format = format_; + header->fps = av_q2d(av_guess_frame_rate( + inputCtx_, inputCtx_->streams[format_.stream], nullptr)); } } // namespace ffmpeg diff --git a/torchvision/csrc/cpu/decoder/video_stream.h b/torchvision/csrc/cpu/decoder/video_stream.h index af1e3fb960f..ec833b867c3 100644 --- a/torchvision/csrc/cpu/decoder/video_stream.h +++ b/torchvision/csrc/cpu/decoder/video_stream.h @@ -1,9 +1,6 @@ -// Copyright 2004-present Facebook. All Rights Reserved. 
- #pragma once #include "stream.h" -#include "time_keeper.h" #include "video_sampler.h" namespace ffmpeg { @@ -26,13 +23,12 @@ class VideoStream : public Stream { int initFormat() override; int estimateBytes(bool flush) override; int copyFrameBytes(ByteStorage* out, bool flush) override; - void setHeader(DecoderHeader* header) override; + void setHeader(DecoderHeader* header, bool flush) override; void ensureSampler(); private: std::unique_ptr sampler_; - TimeKeeper keeper_; int64_t loggingUuid_{0}; }; diff --git a/torchvision/csrc/cpu/video_reader/FfmpegAudioSampler.cpp b/torchvision/csrc/cpu/video_reader/FfmpegAudioSampler.cpp deleted file mode 100644 index 24aecacf946..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegAudioSampler.cpp +++ /dev/null @@ -1,118 +0,0 @@ -#include "FfmpegAudioSampler.h" -#include -#include "FfmpegUtil.h" - -using namespace std; - -FfmpegAudioSampler::FfmpegAudioSampler( - const AudioFormat& in, - const AudioFormat& out) - : inFormat_(in), outFormat_(out) {} - -FfmpegAudioSampler::~FfmpegAudioSampler() { - if (swrContext_) { - swr_free(&swrContext_); - } -} - -int FfmpegAudioSampler::init() { - swrContext_ = swr_alloc_set_opts( - nullptr, // we're allocating a new context - av_get_default_channel_layout(outFormat_.channels), // out_ch_layout - static_cast(outFormat_.format), // out_sample_fmt - outFormat_.samples, // out_sample_rate - av_get_default_channel_layout(inFormat_.channels), // in_ch_layout - static_cast(inFormat_.format), // in_sample_fmt - inFormat_.samples, // in_sample_rate - 0, // log_offset - nullptr); // log_ctx - if (swrContext_ == nullptr) { - LOG(ERROR) << "swr_alloc_set_opts fails"; - return -1; - } - int result = 0; - if ((result = swr_init(swrContext_)) < 0) { - LOG(ERROR) << "swr_init failed, err: " << ffmpeg_util::getErrorDesc(result) - << ", in -> format: " << inFormat_.format - << ", channels: " << inFormat_.channels - << ", samples: " << inFormat_.samples - << ", out -> format: " << 
outFormat_.format - << ", channels: " << outFormat_.channels - << ", samples: " << outFormat_.samples; - return -1; - } - return 0; -} - -int64_t FfmpegAudioSampler::getSampleBytes(const AVFrame* frame) const { - auto outSamples = getOutNumSamples(frame->nb_samples); - - return av_samples_get_buffer_size( - nullptr, - outFormat_.channels, - outSamples, - static_cast(outFormat_.format), - 1); -} - -// https://www.ffmpeg.org/doxygen/3.2/group__lswr.html -unique_ptr FfmpegAudioSampler::sample(const AVFrame* frame) { - if (!frame) { - return nullptr; // no flush for videos - } - - auto inNumSamples = frame->nb_samples; - auto outNumSamples = getOutNumSamples(frame->nb_samples); - - auto outSampleSize = getSampleBytes(frame); - AvDataPtr frameData(static_cast(av_malloc(outSampleSize))); - - uint8_t* outPlanes[AVRESAMPLE_MAX_CHANNELS]; - int result = 0; - if ((result = av_samples_fill_arrays( - outPlanes, - nullptr, // linesize is not needed - frameData.get(), - outFormat_.channels, - outNumSamples, - static_cast(outFormat_.format), - 1)) < 0) { - LOG(ERROR) << "av_samples_fill_arrays failed, err: " - << ffmpeg_util::getErrorDesc(result) - << ", outNumSamples: " << outNumSamples - << ", format: " << outFormat_.format; - return nullptr; - } - - if ((result = swr_convert( - swrContext_, - &outPlanes[0], - outNumSamples, - (const uint8_t**)&frame->data[0], - inNumSamples)) < 0) { - LOG(ERROR) << "swr_convert faield, err: " - << ffmpeg_util::getErrorDesc(result); - return nullptr; - } - // result returned by swr_convert is the No. of actual output samples. - // So update the buffer size using av_samples_get_buffer_size - result = av_samples_get_buffer_size( - nullptr, - outFormat_.channels, - result, - static_cast(outFormat_.format), - 1); - - return make_unique(std::move(frameData), result, 0); -} -/* -Because of decoding delay, the returned value is an upper bound of No. 
of -output samples -*/ -int64_t FfmpegAudioSampler::getOutNumSamples(int inNumSamples) const { - return av_rescale_rnd( - swr_get_delay(swrContext_, inFormat_.samples) + inNumSamples, - outFormat_.samples, - inFormat_.samples, - AV_ROUND_UP); -} diff --git a/torchvision/csrc/cpu/video_reader/FfmpegAudioSampler.h b/torchvision/csrc/cpu/video_reader/FfmpegAudioSampler.h deleted file mode 100644 index 767a5ca6e4f..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegAudioSampler.h +++ /dev/null @@ -1,32 +0,0 @@ -#pragma once - -#include "FfmpegSampler.h" - -#define AVRESAMPLE_MAX_CHANNELS 32 - -/** - * Class transcode audio frames from one format into another - */ -class FfmpegAudioSampler : public FfmpegSampler { - public: - explicit FfmpegAudioSampler(const AudioFormat& in, const AudioFormat& out); - ~FfmpegAudioSampler() override; - - int init() override; - - int64_t getSampleBytes(const AVFrame* frame) const; - // FfmpegSampler overrides - // returns number of bytes of the sampled data - std::unique_ptr sample(const AVFrame* frame) override; - - const AudioFormat& getInFormat() const { - return inFormat_; - } - - private: - int64_t getOutNumSamples(int inNumSamples) const; - - AudioFormat inFormat_; - AudioFormat outFormat_; - SwrContext* swrContext_{nullptr}; -}; diff --git a/torchvision/csrc/cpu/video_reader/FfmpegAudioStream.cpp b/torchvision/csrc/cpu/video_reader/FfmpegAudioStream.cpp deleted file mode 100644 index b5b1e2fbda5..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegAudioStream.cpp +++ /dev/null @@ -1,103 +0,0 @@ -#include "FfmpegAudioStream.h" -#include "FfmpegUtil.h" - -using namespace std; - -namespace { - -bool operator==(const AudioFormat& x, const AVCodecContext& y) { - return x.samples == y.sample_rate && x.channels == y.channels && - x.format == y.sample_fmt; -} - -AudioFormat& toAudioFormat( - AudioFormat& audioFormat, - const AVCodecContext& codecCtx) { - audioFormat.samples = codecCtx.sample_rate; - audioFormat.channels = 
codecCtx.channels; - audioFormat.format = codecCtx.sample_fmt; - - return audioFormat; -} - -} // namespace - -FfmpegAudioStream::FfmpegAudioStream( - AVFormatContext* inputCtx, - int index, - enum AVMediaType avMediaType, - MediaFormat mediaFormat, - double seekFrameMargin) - : FfmpegStream(inputCtx, index, avMediaType, seekFrameMargin), - mediaFormat_(mediaFormat) {} - -FfmpegAudioStream::~FfmpegAudioStream() {} - -void FfmpegAudioStream::checkStreamDecodeParams() { - auto timeBase = getTimeBase(); - if (timeBase.first > 0) { - CHECK_EQ(timeBase.first, inputCtx_->streams[index_]->time_base.num); - CHECK_EQ(timeBase.second, inputCtx_->streams[index_]->time_base.den); - } -} - -void FfmpegAudioStream::updateStreamDecodeParams() { - auto timeBase = getTimeBase(); - if (timeBase.first == 0) { - mediaFormat_.format.audio.timeBaseNum = - inputCtx_->streams[index_]->time_base.num; - mediaFormat_.format.audio.timeBaseDen = - inputCtx_->streams[index_]->time_base.den; - } - mediaFormat_.format.audio.duration = inputCtx_->streams[index_]->duration; -} - -int FfmpegAudioStream::initFormat() { - AudioFormat& format = mediaFormat_.format.audio; - - if (format.samples == 0) { - format.samples = codecCtx_->sample_rate; - } - if (format.channels == 0) { - format.channels = codecCtx_->channels; - } - if (format.format == AV_SAMPLE_FMT_NONE) { - format.format = codecCtx_->sample_fmt; - VLOG(2) << "set stream format sample_fmt: " << format.format; - } - - checkStreamDecodeParams(); - - updateStreamDecodeParams(); - - if (format.samples > 0 && format.channels > 0 && - format.format != AV_SAMPLE_FMT_NONE) { - return 0; - } else { - return -1; - } -} - -unique_ptr FfmpegAudioStream::sampleFrameData() { - AudioFormat& audioFormat = mediaFormat_.format.audio; - - if (!sampler_ || !(sampler_->getInFormat() == *codecCtx_)) { - AudioFormat newInFormat; - newInFormat = toAudioFormat(newInFormat, *codecCtx_); - sampler_ = make_unique(newInFormat, audioFormat); - VLOG(1) << "Set sampler input 
audio format" - << ", samples: " << newInFormat.samples - << ", channels: " << newInFormat.channels - << ", format: " << newInFormat.format - << " : output audio sampler format" - << ", samples: " << audioFormat.samples - << ", channels: " << audioFormat.channels - << ", format: " << audioFormat.format; - int ret = sampler_->init(); - if (ret < 0) { - VLOG(1) << "Fail to initialize audio sampler"; - return nullptr; - } - } - return sampler_->sample(frame_); -} diff --git a/torchvision/csrc/cpu/video_reader/FfmpegAudioStream.h b/torchvision/csrc/cpu/video_reader/FfmpegAudioStream.h deleted file mode 100644 index 1d4f7a2f2ee..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegAudioStream.h +++ /dev/null @@ -1,54 +0,0 @@ -#pragma once - -#include -#include "FfmpegAudioSampler.h" -#include "FfmpegStream.h" - -/** - * Class uses FFMPEG library to decode one video stream. - */ -class FfmpegAudioStream : public FfmpegStream { - public: - explicit FfmpegAudioStream( - AVFormatContext* inputCtx, - int index, - enum AVMediaType avMediaType, - MediaFormat mediaFormat, - double seekFrameMargin); - - ~FfmpegAudioStream() override; - - // FfmpegStream overrides - MediaType getMediaType() const override { - return MediaType::TYPE_AUDIO; - } - - FormatUnion getMediaFormat() const override { - return mediaFormat_.format; - } - - int64_t getStartPts() const override { - return mediaFormat_.format.audio.startPts; - } - int64_t getEndPts() const override { - return mediaFormat_.format.audio.endPts; - } - // return numerator and denominator of time base - std::pair getTimeBase() const { - return std::make_pair( - mediaFormat_.format.audio.timeBaseNum, - mediaFormat_.format.audio.timeBaseDen); - } - - void checkStreamDecodeParams(); - - void updateStreamDecodeParams(); - - protected: - int initFormat() override; - std::unique_ptr sampleFrameData() override; - - private: - MediaFormat mediaFormat_; - std::unique_ptr sampler_{nullptr}; -}; diff --git 
a/torchvision/csrc/cpu/video_reader/FfmpegDecoder.cpp b/torchvision/csrc/cpu/video_reader/FfmpegDecoder.cpp deleted file mode 100644 index fb4d302cc03..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegDecoder.cpp +++ /dev/null @@ -1,412 +0,0 @@ -#include "FfmpegDecoder.h" -#include "FfmpegAudioStream.h" -#include "FfmpegUtil.h" -#include "FfmpegVideoStream.h" - -using namespace std; - -static AVPacket avPkt; - -namespace { - -unique_ptr createFfmpegStream( - MediaType type, - AVFormatContext* ctx, - int idx, - MediaFormat& mediaFormat, - double seekFrameMargin) { - enum AVMediaType avType; - CHECK(ffmpeg_util::mapMediaType(type, &avType)); - switch (type) { - case MediaType::TYPE_VIDEO: - return make_unique( - ctx, idx, avType, mediaFormat, seekFrameMargin); - case MediaType::TYPE_AUDIO: - return make_unique( - ctx, idx, avType, mediaFormat, seekFrameMargin); - default: - return nullptr; - } -} - -} // namespace - -FfmpegAvioContext::FfmpegAvioContext() - : workBuffersize_(VIO_BUFFER_SZ), - workBuffer_((uint8_t*)av_malloc(workBuffersize_)), - inputFile_(nullptr), - inputBuffer_(nullptr), - inputBufferSize_(0) {} - -int FfmpegAvioContext::initAVIOContext(const uint8_t* buffer, int64_t size) { - inputBuffer_ = buffer; - inputBufferSize_ = size; - avioCtx_ = avio_alloc_context( - workBuffer_, - workBuffersize_, - 0, - reinterpret_cast(this), - &FfmpegAvioContext::readMemory, - nullptr, // no write function - &FfmpegAvioContext::seekMemory); - return 0; -} - -FfmpegAvioContext::~FfmpegAvioContext() { - /* note: the internal buffer could have changed, and be != workBuffer_ */ - if (avioCtx_) { - av_freep(&avioCtx_->buffer); - av_freep(&avioCtx_); - } else { - av_freep(&workBuffer_); - } - if (inputFile_) { - fclose(inputFile_); - } -} - -int FfmpegAvioContext::read(uint8_t* buf, int buf_size) { - if (inputBuffer_) { - return readMemory(this, buf, buf_size); - } else { - return -1; - } -} - -int FfmpegAvioContext::readMemory(void* opaque, uint8_t* buf, int 
buf_size) { - FfmpegAvioContext* h = static_cast(opaque); - if (buf_size < 0) { - return -1; - } - - int reminder = h->inputBufferSize_ - h->offset_; - int r = buf_size < reminder ? buf_size : reminder; - if (r < 0) { - return AVERROR_EOF; - } - - memcpy(buf, h->inputBuffer_ + h->offset_, r); - h->offset_ += r; - return r; -} - -int64_t FfmpegAvioContext::seek(int64_t offset, int whence) { - if (inputBuffer_) { - return seekMemory(this, offset, whence); - } else { - return -1; - } -} - -int64_t FfmpegAvioContext::seekMemory( - void* opaque, - int64_t offset, - int whence) { - FfmpegAvioContext* h = static_cast(opaque); - switch (whence) { - case SEEK_CUR: // from current position - h->offset_ += offset; - break; - case SEEK_END: // from eof - h->offset_ = h->inputBufferSize_ + offset; - break; - case SEEK_SET: // from beginning of file - h->offset_ = offset; - break; - case AVSEEK_SIZE: - return h->inputBufferSize_; - } - return h->offset_; -} - -int FfmpegDecoder::init( - const std::string& filename, - bool isDecodeFile, - FfmpegAvioContext& ioctx, - DecoderOutput& decoderOutput) { - cleanUp(); - - int ret = 0; - if (!isDecodeFile) { - formatCtx_ = avformat_alloc_context(); - if (!formatCtx_) { - LOG(ERROR) << "avformat_alloc_context failed"; - return -1; - } - formatCtx_->pb = ioctx.get_avio(); - formatCtx_->flags |= AVFMT_FLAG_CUSTOM_IO; - - // Determining the input format: - int probeSz = AVPROBE_SIZE + AVPROBE_PADDING_SIZE; - uint8_t* probe((uint8_t*)av_malloc(probeSz)); - memset(probe, 0, probeSz); - int len = ioctx.read(probe, probeSz - AVPROBE_PADDING_SIZE); - if (len < probeSz - AVPROBE_PADDING_SIZE) { - LOG(ERROR) << "Insufficient data to determine video format"; - av_freep(&probe); - return -1; - } - // seek back to start of stream - ioctx.seek(0, SEEK_SET); - - unique_ptr probeData(new AVProbeData()); - probeData->buf = probe; - probeData->buf_size = len; - probeData->filename = ""; - // Determine the input-format: - formatCtx_->iformat = 
av_probe_input_format(probeData.get(), 1); - // this is to avoid the double-free error - if (formatCtx_->iformat == nullptr) { - LOG(ERROR) << "av_probe_input_format fails"; - return -1; - } - VLOG(1) << "av_probe_input_format succeeds"; - av_freep(&probe); - - ret = avformat_open_input(&formatCtx_, "", nullptr, nullptr); - } else { - ret = avformat_open_input(&formatCtx_, filename.c_str(), nullptr, nullptr); - } - - if (ret < 0) { - LOG(ERROR) << "avformat_open_input failed, error: " - << ffmpeg_util::getErrorDesc(ret); - cleanUp(); - return ret; - } - ret = avformat_find_stream_info(formatCtx_, nullptr); - if (ret < 0) { - LOG(ERROR) << "avformat_find_stream_info failed, error: " - << ffmpeg_util::getErrorDesc(ret); - cleanUp(); - return ret; - } - if (!initStreams()) { - LOG(ERROR) << "Cannot activate streams"; - cleanUp(); - return -1; - } - - for (auto& stream : streams_) { - MediaType mediaType = stream.second->getMediaType(); - decoderOutput.initMediaType(mediaType, stream.second->getMediaFormat()); - } - VLOG(1) << "FfmpegDecoder initialized"; - return 0; -} - -int FfmpegDecoder::decodeFile( - unique_ptr params, - const string& fileName, - DecoderOutput& decoderOutput) { - VLOG(1) << "decode file: " << fileName; - FfmpegAvioContext ioctx; - int ret = decodeLoop(std::move(params), fileName, true, ioctx, decoderOutput); - return ret; -} - -int FfmpegDecoder::decodeMemory( - unique_ptr params, - const uint8_t* buffer, - int64_t size, - DecoderOutput& decoderOutput) { - VLOG(1) << "decode video data in memory"; - FfmpegAvioContext ioctx; - int ret = ioctx.initAVIOContext(buffer, size); - if (ret == 0) { - ret = - decodeLoop(std::move(params), string(""), false, ioctx, decoderOutput); - } - return ret; -} - -int FfmpegDecoder::probeFile( - unique_ptr params, - const string& fileName, - DecoderOutput& decoderOutput) { - VLOG(1) << "probe file: " << fileName; - FfmpegAvioContext ioctx; - return probeVideo(std::move(params), fileName, true, ioctx, decoderOutput); 
-} - -int FfmpegDecoder::probeMemory( - unique_ptr params, - const uint8_t* buffer, - int64_t size, - DecoderOutput& decoderOutput) { - VLOG(1) << "probe video data in memory"; - FfmpegAvioContext ioctx; - int ret = ioctx.initAVIOContext(buffer, size); - if (ret == 0) { - ret = - probeVideo(std::move(params), string(""), false, ioctx, decoderOutput); - } - return ret; -} - -void FfmpegDecoder::cleanUp() { - if (formatCtx_) { - for (auto& stream : streams_) { - // Drain stream buffers. - DecoderOutput decoderOutput; - stream.second->flush(1, decoderOutput); - stream.second.reset(); - } - streams_.clear(); - avformat_close_input(&formatCtx_); - } -} - -FfmpegStream* FfmpegDecoder::findStreamByIndex(int streamIndex) const { - auto it = streams_.find(streamIndex); - return it != streams_.end() ? it->second.get() : nullptr; -} - -/* -Reference implementation: -https://ffmpeg.org/doxygen/3.4/demuxing_decoding_8c-example.html -*/ -int FfmpegDecoder::decodeLoop( - unique_ptr params, - const std::string& filename, - bool isDecodeFile, - FfmpegAvioContext& ioctx, - DecoderOutput& decoderOutput) { - params_ = std::move(params); - - int ret = init(filename, isDecodeFile, ioctx, decoderOutput); - if (ret < 0) { - return ret; - } - // init package - av_init_packet(&avPkt); - avPkt.data = nullptr; - avPkt.size = 0; - - int result = 0; - bool ptsInRange = true; - while (ptsInRange) { - result = av_read_frame(formatCtx_, &avPkt); - if (result == AVERROR(EAGAIN)) { - VLOG(1) << "Decoder is busy"; - ret = 0; - break; - } else if (result == AVERROR_EOF) { - VLOG(1) << "Stream decoding is completed"; - ret = 0; - break; - } else if (result < 0) { - VLOG(1) << "av_read_frame fails. Break decoder loop. Error: " - << ffmpeg_util::getErrorDesc(result); - ret = result; - break; - } - - ret = 0; - auto stream = findStreamByIndex(avPkt.stream_index); - if (stream == nullptr) { - // the packet is from a stream the caller is not interested. Ignore it - VLOG(2) << "avPkt ignored. 
stream index: " << avPkt.stream_index; - // Need to free the memory of AVPacket. Otherwise, memory leak happens - av_packet_unref(&avPkt); - continue; - } - - do { - result = stream->sendPacket(&avPkt); - if (result == AVERROR(EAGAIN)) { - VLOG(2) << "avcodec_send_packet returns AVERROR(EAGAIN)"; - // start to recevie available frames from internal buffer - stream->receiveAvailFrames(params_->getPtsOnly, decoderOutput); - if (isPtsExceedRange()) { - // exit the most-outer while loop - VLOG(1) << "In all streams, exceed the end pts. Exit decoding loop"; - ret = 0; - ptsInRange = false; - break; - } - } else if (result < 0) { - LOG(WARNING) << "avcodec_send_packet failed. Error: " - << ffmpeg_util::getErrorDesc(result); - ret = result; - break; - } else { - VLOG(2) << "avcodec_send_packet succeeds"; - // succeed. Read the next AVPacket and send out it - break; - } - } while (ptsInRange); - // Need to free the memory of AVPacket. Otherwise, memory leak happens - av_packet_unref(&avPkt); - } - /* flush cached frames */ - flushStreams(decoderOutput); - return ret; -} - -int FfmpegDecoder::probeVideo( - unique_ptr params, - const std::string& filename, - bool isDecodeFile, - FfmpegAvioContext& ioctx, - DecoderOutput& decoderOutput) { - params_ = std::move(params); - return init(filename, isDecodeFile, ioctx, decoderOutput); -} - -bool FfmpegDecoder::initStreams() { - for (auto it = params_->formats.begin(); it != params_->formats.end(); ++it) { - AVMediaType mediaType; - if (!ffmpeg_util::mapMediaType(it->first, &mediaType)) { - LOG(ERROR) << "Unknown media type: " << it->first; - return false; - } - int streamIdx = - av_find_best_stream(formatCtx_, mediaType, -1, -1, nullptr, 0); - - if (streamIdx >= 0) { - VLOG(2) << "find stream index: " << streamIdx; - auto stream = createFfmpegStream( - it->first, - formatCtx_, - streamIdx, - it->second, - params_->seekFrameMargin); - - CHECK(stream); - if (stream->openCodecContext() < 0) { - LOG(ERROR) << "Cannot open codec. 
Stream index: " << streamIdx; - return false; - } - streams_.emplace(streamIdx, move(stream)); - } else { - VLOG(1) << "Cannot open find stream of type " << it->first; - } - } - // Seek frames in each stream - int ret = 0; - for (auto& stream : streams_) { - auto startPts = stream.second->getStartPts(); - VLOG(1) << "stream: " << stream.first << " startPts: " << startPts; - if (startPts > 0 && (ret = stream.second->seekFrame(startPts)) < 0) { - LOG(WARNING) << "seekFrame in stream fails"; - return false; - } - } - VLOG(1) << "initStreams succeeds"; - return true; -} - -bool FfmpegDecoder::isPtsExceedRange() { - bool exceed = true; - for (auto& stream : streams_) { - exceed = exceed && stream.second->isFramePtsExceedRange(); - } - return exceed; -} - -void FfmpegDecoder::flushStreams(DecoderOutput& decoderOutput) { - for (auto& stream : streams_) { - stream.second->flush(params_->getPtsOnly, decoderOutput); - } -} diff --git a/torchvision/csrc/cpu/video_reader/FfmpegDecoder.h b/torchvision/csrc/cpu/video_reader/FfmpegDecoder.h deleted file mode 100644 index a0a564a4214..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegDecoder.h +++ /dev/null @@ -1,127 +0,0 @@ -#pragma once - -#include -#include - -#include "FfmpegHeaders.h" -#include "FfmpegStream.h" -#include "Interface.h" - -#define VIO_BUFFER_SZ 81920 -#define AVPROBE_SIZE 8192 - -class DecoderParameters { - public: - std::unordered_map formats; - // av_seek_frame is imprecise so seek to a timestamp earlier by a margin - // The unit of margin is second - double seekFrameMargin{1.0}; - // When getPtsOnly is set to 1, we only get pts of each frame and don not - // output frame data. 
It will be much faster - int64_t getPtsOnly{0}; -}; - -class FfmpegAvioContext { - public: - FfmpegAvioContext(); - - int initAVIOContext(const uint8_t* buffer, int64_t size); - - ~FfmpegAvioContext(); - - int read(uint8_t* buf, int buf_size); - - static int readMemory(void* opaque, uint8_t* buf, int buf_size); - - int64_t seek(int64_t offset, int whence); - - static int64_t seekMemory(void* opaque, int64_t offset, int whence); - - AVIOContext* get_avio() { - return avioCtx_; - } - - private: - int workBuffersize_; - uint8_t* workBuffer_; - // for file mode - FILE* inputFile_; - // for memory mode - const uint8_t* inputBuffer_; - int inputBufferSize_; - int offset_ = 0; - - AVIOContext* avioCtx_{nullptr}; -}; - -class FfmpegDecoder { - public: - FfmpegDecoder() { - av_register_all(); - } - ~FfmpegDecoder() { - cleanUp(); - } - // return 0 on success - // return negative number on failure - int decodeFile( - std::unique_ptr params, - const std::string& filename, - DecoderOutput& decoderOutput); - // return 0 on success - // return negative number on failure - int decodeMemory( - std::unique_ptr params, - const uint8_t* buffer, - int64_t size, - DecoderOutput& decoderOutput); - // return 0 on success - // return negative number on failure - int probeFile( - std::unique_ptr params, - const std::string& filename, - DecoderOutput& decoderOutput); - // return 0 on success - // return negative number on failure - int probeMemory( - std::unique_ptr params, - const uint8_t* buffer, - int64_t size, - DecoderOutput& decoderOutput); - - void cleanUp(); - - private: - FfmpegStream* findStreamByIndex(int streamIndex) const; - - int init( - const std::string& filename, - bool isDecodeFile, - FfmpegAvioContext& ioctx, - DecoderOutput& decoderOutput); - // return 0 on success - // return negative number on failure - int decodeLoop( - std::unique_ptr params, - const std::string& filename, - bool isDecodeFile, - FfmpegAvioContext& ioctx, - DecoderOutput& decoderOutput); - - int 
probeVideo( - std::unique_ptr params, - const std::string& filename, - bool isDecodeFile, - FfmpegAvioContext& ioctx, - DecoderOutput& decoderOutput); - - bool initStreams(); - - void flushStreams(DecoderOutput& decoderOutput); - // whether in all streams, the pts of most recent frame exceeds range - bool isPtsExceedRange(); - - std::unordered_map> streams_; - AVFormatContext* formatCtx_{nullptr}; - std::unique_ptr params_{nullptr}; -}; diff --git a/torchvision/csrc/cpu/video_reader/FfmpegHeaders.h b/torchvision/csrc/cpu/video_reader/FfmpegHeaders.h deleted file mode 100644 index ff26aa30a8d..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegHeaders.h +++ /dev/null @@ -1,13 +0,0 @@ -#pragma once - -extern "C" { -#include -#include -#include -#include -#include -#include -#include -#include -#include -} diff --git a/torchvision/csrc/cpu/video_reader/FfmpegSampler.h b/torchvision/csrc/cpu/video_reader/FfmpegSampler.h deleted file mode 100644 index 3d00be3486f..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegSampler.h +++ /dev/null @@ -1,16 +0,0 @@ -#pragma once - -#include "FfmpegHeaders.h" -#include "Interface.h" - -/** - * Class sample data from AVFrame - */ -class FfmpegSampler { - public: - virtual ~FfmpegSampler() = default; - // return 0 on success and negative number on failure - virtual int init() = 0; - // sample from the given frame - virtual std::unique_ptr sample(const AVFrame* frame) = 0; -}; diff --git a/torchvision/csrc/cpu/video_reader/FfmpegStream.cpp b/torchvision/csrc/cpu/video_reader/FfmpegStream.cpp deleted file mode 100644 index b745170baf4..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegStream.cpp +++ /dev/null @@ -1,188 +0,0 @@ -#include "FfmpegStream.h" -#include "FfmpegUtil.h" - -using namespace std; - -// (TODO) Currently, disable the use of refCount -static int refCount = 0; - -FfmpegStream::FfmpegStream( - AVFormatContext* inputCtx, - int index, - enum AVMediaType avMediaType, - double seekFrameMargin) - : 
inputCtx_(inputCtx), - index_(index), - avMediaType_(avMediaType), - seekFrameMargin_(seekFrameMargin) {} - -FfmpegStream::~FfmpegStream() { - if (frame_) { - av_frame_free(&frame_); - } - avcodec_free_context(&codecCtx_); -} - -int FfmpegStream::openCodecContext() { - VLOG(2) << "stream start_time: " << inputCtx_->streams[index_]->start_time; - - auto typeString = av_get_media_type_string(avMediaType_); - AVStream* st = inputCtx_->streams[index_]; - auto codec_id = st->codecpar->codec_id; - VLOG(1) << "codec_id: " << codec_id; - AVCodec* codec = avcodec_find_decoder(codec_id); - if (!codec) { - LOG(ERROR) << "avcodec_find_decoder failed for codec_id: " << int(codec_id); - return AVERROR(EINVAL); - } - VLOG(1) << "Succeed to find decoder"; - - codecCtx_ = avcodec_alloc_context3(codec); - if (!codecCtx_) { - LOG(ERROR) << "avcodec_alloc_context3 fails"; - return AVERROR(ENOMEM); - } - - int ret; - /* Copy codec parameters from input stream to output codec context */ - if ((ret = avcodec_parameters_to_context(codecCtx_, st->codecpar)) < 0) { - LOG(ERROR) << "Failed to copy " << typeString - << " codec parameters to decoder context"; - return ret; - } - - AVDictionary* opts = nullptr; - av_dict_set(&opts, "refcounted_frames", refCount ? "1" : "0", 0); - - // after avcodec_open2, value of codecCtx_->time_base is NOT meaningful - // But inputCtx_->streams[index_]->time_base has meaningful values - if ((ret = avcodec_open2(codecCtx_, codec, &opts)) < 0) { - LOG(ERROR) << "avcodec_open2 failed. 
" << ffmpeg_util::getErrorDesc(ret); - return ret; - } - VLOG(1) << "Succeed to open codec"; - - frame_ = av_frame_alloc(); - return initFormat(); -} - -unique_ptr FfmpegStream::getFrameData(int getPtsOnly) { - if (!codecCtx_) { - LOG(ERROR) << "Codec is not initialized"; - return nullptr; - } - if (getPtsOnly) { - unique_ptr decodedFrame = make_unique(); - decodedFrame->pts_ = frame_->pts; - return decodedFrame; - } else { - unique_ptr decodedFrame = sampleFrameData(); - if (decodedFrame) { - decodedFrame->pts_ = frame_->pts; - } - return decodedFrame; - } -} - -void FfmpegStream::flush(int getPtsOnly, DecoderOutput& decoderOutput) { - VLOG(1) << "Media Type: " << getMediaType() << ", flush stream."; - // need to receive frames before entering draining mode - receiveAvailFrames(getPtsOnly, decoderOutput); - - VLOG(2) << "send nullptr packet"; - sendPacket(nullptr); - // receive remaining frames after entering draining mode - receiveAvailFrames(getPtsOnly, decoderOutput); - - avcodec_flush_buffers(codecCtx_); -} - -bool FfmpegStream::isFramePtsInRange() { - CHECK(frame_); - auto pts = frame_->pts; - auto startPts = this->getStartPts(); - auto endPts = this->getEndPts(); - VLOG(2) << "isPtsInRange. pts: " << pts << ", startPts: " << startPts - << ", endPts: " << endPts; - return (pts == AV_NOPTS_VALUE) || - (pts >= startPts && (endPts >= 0 ? pts <= endPts : true)); -} - -bool FfmpegStream::isFramePtsExceedRange() { - if (frame_) { - auto endPts = this->getEndPts(); - VLOG(2) << "isFramePtsExceedRange. last_pts_: " << last_pts_ - << ", endPts: " << endPts; - return endPts >= 0 ? last_pts_ >= endPts : false; - } else { - return true; - } -} - -// seek a frame -int FfmpegStream::seekFrame(int64_t seekPts) { - // translate margin from second to pts - int64_t margin = (int64_t)( - seekFrameMargin_ * (double)inputCtx_->streams[index_]->time_base.den / - (double)inputCtx_->streams[index_]->time_base.num); - int64_t real_seekPts = (seekPts - margin) > 0 ? 
(seekPts - margin) : 0; - VLOG(2) << "seek margin: " << margin; - VLOG(2) << "real seekPts: " << real_seekPts; - int ret = av_seek_frame( - inputCtx_, - index_, - (seekPts - margin) > 0 ? (seekPts - margin) : 0, - AVSEEK_FLAG_BACKWARD); - if (ret < 0) { - LOG(WARNING) << "av_seek_frame fails. Stream index: " << index_; - return ret; - } - return 0; -} - -// send/receive encoding and decoding API overview -// https://ffmpeg.org/doxygen/3.4/group__lavc__encdec.html -int FfmpegStream::sendPacket(const AVPacket* packet) { - return avcodec_send_packet(codecCtx_, packet); -} - -int FfmpegStream::receiveFrame() { - int ret = avcodec_receive_frame(codecCtx_, frame_); - if (ret >= 0) { - // succeed - frame_->pts = av_frame_get_best_effort_timestamp(frame_); - if (frame_->pts == AV_NOPTS_VALUE) { - // Trick: if we can not figure out pts, we just set it to be (last_pts + - // 1) - frame_->pts = last_pts_ + 1; - } - last_pts_ = frame_->pts; - - VLOG(2) << "avcodec_receive_frame succeed"; - } else if (ret == AVERROR(EAGAIN)) { - VLOG(2) << "avcodec_receive_frame fails and returns AVERROR(EAGAIN). "; - } else if (ret == AVERROR_EOF) { - // no more frame to read - VLOG(2) << "avcodec_receive_frame returns AVERROR_EOF"; - } else { - LOG(WARNING) << "avcodec_receive_frame failed. 
Error: " - << ffmpeg_util::getErrorDesc(ret); - } - return ret; -} - -void FfmpegStream::receiveAvailFrames( - int getPtsOnly, - DecoderOutput& decoderOutput) { - int result = 0; - while ((result = receiveFrame()) >= 0) { - unique_ptr decodedFrame = getFrameData(getPtsOnly); - - if (decodedFrame && - ((!getPtsOnly && decodedFrame->frameSize_ > 0) || getPtsOnly)) { - if (isFramePtsInRange()) { - decoderOutput.addMediaFrame(getMediaType(), std::move(decodedFrame)); - } - } // end-if - } // end-while -} diff --git a/torchvision/csrc/cpu/video_reader/FfmpegStream.h b/torchvision/csrc/cpu/video_reader/FfmpegStream.h deleted file mode 100644 index b66a36977ec..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegStream.h +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. -#pragma once - -#include -#include -#include -#include "FfmpegHeaders.h" -#include "Interface.h" - -/* -Class uses FFMPEG library to decode one media stream (audio or video). -*/ -class FfmpegStream { - public: - FfmpegStream( - AVFormatContext* inputCtx, - int index, - enum AVMediaType avMediaType, - double seekFrameMargin); - virtual ~FfmpegStream(); - - // returns 0 - on success or negative error - int openCodecContext(); - // returns stream index - int getIndex() const { - return index_; - } - // returns number decoded/sampled bytes - std::unique_ptr getFrameData(int getPtsOnly); - // flush the stream at the end of decoding. 
- // Return 0 on success and -1 when cache is drained - void flush(int getPtsOnly, DecoderOutput& decoderOutput); - // seek a frame - int seekFrame(int64_t ts); - // send an AVPacket - int sendPacket(const AVPacket* packet); - // receive AVFrame - int receiveFrame(); - // receive all available frames from the internal buffer - void receiveAvailFrames(int getPtsOnly, DecoderOutput& decoderOutput); - // return media type - virtual MediaType getMediaType() const = 0; - // return media format - virtual FormatUnion getMediaFormat() const = 0; - // return start presentation timestamp - virtual int64_t getStartPts() const = 0; - // return end presentation timestamp - virtual int64_t getEndPts() const = 0; - // is the pts of most recent frame within range? - bool isFramePtsInRange(); - // does the pts of most recent frame exceed range? - bool isFramePtsExceedRange(); - - protected: - virtual int initFormat() = 0; - // returns a decoded frame - virtual std::unique_ptr sampleFrameData() = 0; - - protected: - AVFormatContext* const inputCtx_; - const int index_; - enum AVMediaType avMediaType_; - - AVCodecContext* codecCtx_{nullptr}; - AVFrame* frame_{nullptr}; - // pts of last decoded frame - int64_t last_pts_{0}; - double seekFrameMargin_{1.0}; -}; diff --git a/torchvision/csrc/cpu/video_reader/FfmpegUtil.cpp b/torchvision/csrc/cpu/video_reader/FfmpegUtil.cpp deleted file mode 100644 index 9e804ee67c0..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegUtil.cpp +++ /dev/null @@ -1,111 +0,0 @@ -#include "FfmpegUtil.h" - -using namespace std; - -namespace ffmpeg_util { - -bool mapFfmpegType(AVMediaType media, MediaType* type) { - switch (media) { - case AVMEDIA_TYPE_VIDEO: - *type = MediaType::TYPE_VIDEO; - return true; - case AVMEDIA_TYPE_AUDIO: - *type = MediaType::TYPE_AUDIO; - return true; - default: - return false; - } -} - -bool mapMediaType(MediaType type, AVMediaType* media) { - switch (type) { - case MediaType::TYPE_VIDEO: - *media = AVMEDIA_TYPE_VIDEO; - 
return true; - case MediaType::TYPE_AUDIO: - *media = AVMEDIA_TYPE_AUDIO; - return true; - default: - return false; - } -} - -void setFormatDimensions( - int& destW, - int& destH, - int userW, - int userH, - int srcW, - int srcH, - int minDimension) { - // rounding rules - // int -> double -> round - // round up if fraction is >= 0.5 or round down if fraction is < 0.5 - // int result = double(value) + 0.5 - // here we rounding double to int according to the above rule - if (userW == 0 && userH == 0) { - if (minDimension > 0) { // #2 - if (srcW > srcH) { - // landscape - destH = minDimension; - destW = round(double(srcW * minDimension) / srcH); - } else { - // portrait - destW = minDimension; - destH = round(double(srcH * minDimension) / srcW); - } - } else { // #1 - destW = srcW; - destH = srcH; - } - } else if (userW != 0 && userH == 0) { // #3 - destW = userW; - destH = round(double(srcH * userW) / srcW); - } else if (userW == 0 && userH != 0) { // #4 - destW = round(double(srcW * userH) / srcH); - destH = userH; - } else { - // userW != 0 && userH != 0. 
#5 - destW = userW; - destH = userH; - } - // prevent zeros - destW = std::max(destW, 1); - destH = std::max(destH, 1); -} - -bool validateVideoFormat(const VideoFormat& f) { - /* - Valid parameters values for decoder - ___________________________________________________ - | W | H | minDimension | algorithm | - |_________________________________________________| - | 0 | 0 | 0 | original | - |_________________________________________________| - | 0 | 0 | >0 |scale to min dimension| - |_____|_____|____________________________________ | - | >0 | 0 | 0 | scale keeping W | - |_________________________________________________| - | 0 | >0 | 0 | scale keeping H | - |_________________________________________________| - | >0 | >0 | 0 | stretch/scale | - |_________________________________________________| - - */ - return (f.width == 0 && f.height == 0) || // #1 and #2 - (f.width != 0 && f.height != 0 && f.minDimension == 0) || // # 5 - (((f.width != 0 && f.height == 0) || // #3 and #4 - (f.width == 0 && f.height != 0)) && - f.minDimension == 0); -} - -string getErrorDesc(int errnum) { - array buffer; - if (av_strerror(errnum, buffer.data(), buffer.size()) < 0) { - return string("Unknown error code"); - } - buffer.back() = 0; - return string(buffer.data()); -} - -} // namespace ffmpeg_util diff --git a/torchvision/csrc/cpu/video_reader/FfmpegUtil.h b/torchvision/csrc/cpu/video_reader/FfmpegUtil.h deleted file mode 100644 index 9f42eb53c97..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegUtil.h +++ /dev/null @@ -1,27 +0,0 @@ -#pragma once - -#include -#include -#include "FfmpegHeaders.h" -#include "Interface.h" - -namespace ffmpeg_util { - -bool mapFfmpegType(AVMediaType media, enum MediaType* type); - -bool mapMediaType(MediaType type, enum AVMediaType* media); - -void setFormatDimensions( - int& destW, - int& destH, - int userW, - int userH, - int srcW, - int srcH, - int minDimension); - -bool validateVideoFormat(const VideoFormat& f); - -std::string 
getErrorDesc(int errnum); - -} // namespace ffmpeg_util diff --git a/torchvision/csrc/cpu/video_reader/FfmpegVideoSampler.cpp b/torchvision/csrc/cpu/video_reader/FfmpegVideoSampler.cpp deleted file mode 100644 index d87b3104dd5..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegVideoSampler.cpp +++ /dev/null @@ -1,90 +0,0 @@ -#include "FfmpegVideoSampler.h" -#include "FfmpegUtil.h" - -using namespace std; - -FfmpegVideoSampler::FfmpegVideoSampler( - const VideoFormat& in, - const VideoFormat& out, - int swsFlags) - : inFormat_(in), outFormat_(out), swsFlags_(swsFlags) {} - -FfmpegVideoSampler::~FfmpegVideoSampler() { - if (scaleContext_) { - sws_freeContext(scaleContext_); - scaleContext_ = nullptr; - } -} - -int FfmpegVideoSampler::init() { - VLOG(1) << "Input format: width " << inFormat_.width << ", height " - << inFormat_.height << ", format " << inFormat_.format - << ", minDimension " << inFormat_.minDimension; - VLOG(1) << "Scale format: width " << outFormat_.width << ", height " - << outFormat_.height << ", format " << outFormat_.format - << ", minDimension " << outFormat_.minDimension; - - scaleContext_ = sws_getContext( - inFormat_.width, - inFormat_.height, - (AVPixelFormat)inFormat_.format, - outFormat_.width, - outFormat_.height, - static_cast(outFormat_.format), - swsFlags_, - nullptr, - nullptr, - nullptr); - if (scaleContext_) { - return 0; - } else { - return -1; - } -} - -int32_t FfmpegVideoSampler::getImageBytes() const { - return av_image_get_buffer_size( - (AVPixelFormat)outFormat_.format, outFormat_.width, outFormat_.height, 1); -} - -// https://ffmpeg.org/doxygen/3.4/scaling_video_8c-example.html#a10 -unique_ptr FfmpegVideoSampler::sample(const AVFrame* frame) { - if (!frame) { - return nullptr; // no flush for videos - } - // scaled and cropped image - auto outImageSize = getImageBytes(); - AvDataPtr frameData(static_cast(av_malloc(outImageSize))); - - uint8_t* scalePlanes[4] = {nullptr}; - int scaleLines[4] = {0}; - - int result; - if 
((result = av_image_fill_arrays( - scalePlanes, - scaleLines, - frameData.get(), - static_cast(outFormat_.format), - outFormat_.width, - outFormat_.height, - 1)) < 0) { - LOG(ERROR) << "av_image_fill_arrays failed, err: " - << ffmpeg_util::getErrorDesc(result); - return nullptr; - } - - if ((result = sws_scale( - scaleContext_, - frame->data, - frame->linesize, - 0, - inFormat_.height, - scalePlanes, - scaleLines)) < 0) { - LOG(ERROR) << "sws_scale failed, err: " - << ffmpeg_util::getErrorDesc(result); - return nullptr; - } - - return make_unique(std::move(frameData), outImageSize, 0); -} diff --git a/torchvision/csrc/cpu/video_reader/FfmpegVideoSampler.h b/torchvision/csrc/cpu/video_reader/FfmpegVideoSampler.h deleted file mode 100644 index 1fd6862f537..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegVideoSampler.h +++ /dev/null @@ -1,32 +0,0 @@ -#pragma once - -#include "FfmpegSampler.h" - -/** - * Class transcode video frames from one format into another - */ - -class FfmpegVideoSampler : public FfmpegSampler { - public: - explicit FfmpegVideoSampler( - const VideoFormat& in, - const VideoFormat& out, - int swsFlags = SWS_AREA); - ~FfmpegVideoSampler() override; - - int init() override; - - int32_t getImageBytes() const; - // returns number of bytes of the sampled data - std::unique_ptr sample(const AVFrame* frame) override; - - const VideoFormat& getInFormat() const { - return inFormat_; - } - - private: - VideoFormat inFormat_; - VideoFormat outFormat_; - int swsFlags_; - SwsContext* scaleContext_{nullptr}; -}; diff --git a/torchvision/csrc/cpu/video_reader/FfmpegVideoStream.cpp b/torchvision/csrc/cpu/video_reader/FfmpegVideoStream.cpp deleted file mode 100644 index 7a429249a71..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegVideoStream.cpp +++ /dev/null @@ -1,115 +0,0 @@ -#include "FfmpegVideoStream.h" -#include "FfmpegUtil.h" - -using namespace std; - -namespace { - -bool operator==(const VideoFormat& x, const AVFrame& y) { - return 
x.width == y.width && x.height == y.height && - x.format == static_cast(y.format); -} - -VideoFormat toVideoFormat(const AVFrame& frame) { - VideoFormat videoFormat; - videoFormat.width = frame.width; - videoFormat.height = frame.height; - videoFormat.format = static_cast(frame.format); - - return videoFormat; -} - -} // namespace - -FfmpegVideoStream::FfmpegVideoStream( - AVFormatContext* inputCtx, - int index, - enum AVMediaType avMediaType, - MediaFormat mediaFormat, - double seekFrameMargin) - : FfmpegStream(inputCtx, index, avMediaType, seekFrameMargin), - mediaFormat_(mediaFormat) {} - -FfmpegVideoStream::~FfmpegVideoStream() {} - -void FfmpegVideoStream::checkStreamDecodeParams() { - auto timeBase = getTimeBase(); - if (timeBase.first > 0) { - CHECK_EQ(timeBase.first, inputCtx_->streams[index_]->time_base.num); - CHECK_EQ(timeBase.second, inputCtx_->streams[index_]->time_base.den); - } -} - -void FfmpegVideoStream::updateStreamDecodeParams() { - auto timeBase = getTimeBase(); - if (timeBase.first == 0) { - mediaFormat_.format.video.timeBaseNum = - inputCtx_->streams[index_]->time_base.num; - mediaFormat_.format.video.timeBaseDen = - inputCtx_->streams[index_]->time_base.den; - } - mediaFormat_.format.video.duration = inputCtx_->streams[index_]->duration; -} - -int FfmpegVideoStream::initFormat() { - // set output format - VideoFormat& format = mediaFormat_.format.video; - if (!ffmpeg_util::validateVideoFormat(format)) { - LOG(ERROR) << "Invalid video format"; - return -1; - } - - format.fps = av_q2d( - av_guess_frame_rate(inputCtx_, inputCtx_->streams[index_], nullptr)); - - // keep aspect ratio - ffmpeg_util::setFormatDimensions( - format.width, - format.height, - format.width, - format.height, - codecCtx_->width, - codecCtx_->height, - format.minDimension); - - VLOG(1) << "After adjusting, video format" - << ", width: " << format.width << ", height: " << format.height - << ", format: " << format.format - << ", minDimension: " << format.minDimension; - - if 
(format.format == AV_PIX_FMT_NONE) { - format.format = codecCtx_->pix_fmt; - VLOG(1) << "Set pixel format: " << format.format; - } - - checkStreamDecodeParams(); - - updateStreamDecodeParams(); - - return format.width != 0 && format.height != 0 && - format.format != AV_PIX_FMT_NONE - ? 0 - : -1; -} - -unique_ptr FfmpegVideoStream::sampleFrameData() { - VideoFormat& format = mediaFormat_.format.video; - if (!sampler_ || !(sampler_->getInFormat() == *frame_)) { - VideoFormat newInFormat = toVideoFormat(*frame_); - sampler_ = make_unique(newInFormat, format, SWS_AREA); - VLOG(1) << "Set input video sampler format" - << ", width: " << newInFormat.width - << ", height: " << newInFormat.height - << ", format: " << newInFormat.format - << " : output video sampler format" - << ", width: " << format.width << ", height: " << format.height - << ", format: " << format.format - << ", minDimension: " << format.minDimension; - int ret = sampler_->init(); - if (ret < 0) { - VLOG(1) << "Fail to initialize video sampler"; - return nullptr; - } - } - return sampler_->sample(frame_); -} diff --git a/torchvision/csrc/cpu/video_reader/FfmpegVideoStream.h b/torchvision/csrc/cpu/video_reader/FfmpegVideoStream.h deleted file mode 100644 index 9bfbc9f665b..00000000000 --- a/torchvision/csrc/cpu/video_reader/FfmpegVideoStream.h +++ /dev/null @@ -1,54 +0,0 @@ -#pragma once - -#include -#include "FfmpegStream.h" -#include "FfmpegVideoSampler.h" - -/** - * Class uses FFMPEG library to decode one video stream. 
- */ -class FfmpegVideoStream : public FfmpegStream { - public: - explicit FfmpegVideoStream( - AVFormatContext* inputCtx, - int index, - enum AVMediaType avMediaType, - MediaFormat mediaFormat, - double seekFrameMargin); - - ~FfmpegVideoStream() override; - - // FfmpegStream overrides - MediaType getMediaType() const override { - return MediaType::TYPE_VIDEO; - } - - FormatUnion getMediaFormat() const override { - return mediaFormat_.format; - } - - int64_t getStartPts() const override { - return mediaFormat_.format.video.startPts; - } - int64_t getEndPts() const override { - return mediaFormat_.format.video.endPts; - } - // return numerator and denominator of time base - std::pair getTimeBase() const { - return std::make_pair( - mediaFormat_.format.video.timeBaseNum, - mediaFormat_.format.video.timeBaseDen); - } - - void checkStreamDecodeParams(); - - void updateStreamDecodeParams(); - - protected: - int initFormat() override; - std::unique_ptr sampleFrameData() override; - - private: - MediaFormat mediaFormat_; - std::unique_ptr sampler_{nullptr}; -}; diff --git a/torchvision/csrc/cpu/video_reader/Interface.cpp b/torchvision/csrc/cpu/video_reader/Interface.cpp deleted file mode 100644 index 0ec9f155821..00000000000 --- a/torchvision/csrc/cpu/video_reader/Interface.cpp +++ /dev/null @@ -1,22 +0,0 @@ -#include "Interface.h" - -void DecoderOutput::initMediaType(MediaType mediaType, FormatUnion format) { - MediaData mediaData(format); - media_data_.emplace(mediaType, std::move(mediaData)); -} - -void DecoderOutput::addMediaFrame( - MediaType mediaType, - std::unique_ptr frame) { - if (media_data_.find(mediaType) != media_data_.end()) { - VLOG(1) << "media type: " << mediaType - << " add frame with pts: " << frame->pts_; - media_data_[mediaType].frames_.push_back(std::move(frame)); - } else { - VLOG(1) << "media type: " << mediaType << " not found. 
Skip the frame."; - } -} - -void DecoderOutput::clear() { - media_data_.clear(); -} diff --git a/torchvision/csrc/cpu/video_reader/Interface.h b/torchvision/csrc/cpu/video_reader/Interface.h deleted file mode 100644 index e137008ce7b..00000000000 --- a/torchvision/csrc/cpu/video_reader/Interface.h +++ /dev/null @@ -1,127 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -extern "C" { - -#include -#include -void av_free(void* ptr); -} - -struct avDeleter { - void operator()(uint8_t* p) const { - av_free(p); - } -}; - -const AVPixelFormat defaultVideoPixelFormat = AV_PIX_FMT_RGB24; -const AVSampleFormat defaultAudioSampleFormat = AV_SAMPLE_FMT_FLT; - -using AvDataPtr = std::unique_ptr; - -enum MediaType : uint32_t { - TYPE_VIDEO = 1, - TYPE_AUDIO = 2, -}; - -struct EnumClassHash { - template - uint32_t operator()(T t) const { - return static_cast(t); - } -}; - -struct VideoFormat { - // fields are initialized for the auto detection - // caller can specify some/all of field values if specific output is desirable - - int width{0}; // width in pixels - int height{0}; // height in pixels - int minDimension{0}; // choose min dimension and rescale accordingly - // Output image pixel format. 
data type AVPixelFormat - AVPixelFormat format{defaultVideoPixelFormat}; // type AVPixelFormat - int64_t startPts{0}, endPts{0}; // Start and end presentation timestamp - int timeBaseNum{0}; - int timeBaseDen{1}; // numerator and denominator of time base - float fps{0.0}; - int64_t duration{0}; // duration of the stream, in stream time base -}; - -struct AudioFormat { - // fields are initialized for the auto detection - // caller can specify some/all of field values if specific output is desirable - - int samples{0}; // number samples per second (frequency) - int channels{0}; // number of channels - AVSampleFormat format{defaultAudioSampleFormat}; // type AVSampleFormat - int64_t startPts{0}, endPts{0}; // Start and end presentation timestamp - int timeBaseNum{0}; - int timeBaseDen{1}; // numerator and denominator of time base - int64_t duration{0}; // duration of the stream, in stream time base -}; - -union FormatUnion { - FormatUnion() {} - VideoFormat video; - AudioFormat audio; -}; - -struct MediaFormat { - MediaFormat() {} - - MediaFormat(const MediaFormat& mediaFormat) : type(mediaFormat.type) { - if (type == MediaType::TYPE_VIDEO) { - format.video = mediaFormat.format.video; - } else if (type == MediaType::TYPE_AUDIO) { - format.audio = mediaFormat.format.audio; - } - } - - MediaFormat(MediaType mediaType) : type(mediaType) { - if (mediaType == MediaType::TYPE_VIDEO) { - format.video = VideoFormat(); - } else if (mediaType == MediaType::TYPE_AUDIO) { - format.audio = AudioFormat(); - } - } - // media type - MediaType type; - // format data - FormatUnion format; -}; - -class DecodedFrame { - public: - explicit DecodedFrame() : frame_(nullptr), frameSize_(0), pts_(0) {} - explicit DecodedFrame(AvDataPtr frame, int frameSize, int64_t pts) - : frame_(std::move(frame)), frameSize_(frameSize), pts_(pts) {} - AvDataPtr frame_{nullptr}; - int frameSize_{0}; - int64_t pts_{0}; -}; - -struct MediaData { - MediaData() {} - MediaData(FormatUnion format) : 
format_(format) {} - FormatUnion format_; - std::vector> frames_; -}; - -class DecoderOutput { - public: - explicit DecoderOutput() {} - - ~DecoderOutput() {} - - void initMediaType(MediaType mediaType, FormatUnion format); - - void addMediaFrame(MediaType mediaType, std::unique_ptr frame); - - void clear(); - - std::unordered_map media_data_; -}; diff --git a/torchvision/csrc/cpu/video_reader/VideoReader.cpp b/torchvision/csrc/cpu/video_reader/VideoReader.cpp index dfe7f46bf39..8cc0c79056b 100644 --- a/torchvision/csrc/cpu/video_reader/VideoReader.cpp +++ b/torchvision/csrc/cpu/video_reader/VideoReader.cpp @@ -3,11 +3,11 @@ #include #include #include -#include "FfmpegDecoder.h" -#include "FfmpegHeaders.h" -#include "util.h" +#include "memory_buffer.h" +#include "sync_decoder.h" using namespace std; +using namespace ffmpeg; // If we are in a Windows environment, we need to define // initialization functions for the _custom_ops extension @@ -27,121 +27,140 @@ PyMODINIT_FUNC PyInit_video_reader(void) { namespace video_reader { -class UnknownPixelFormatException : public exception { - const char* what() const throw() override { - return "Unknown pixel format"; - } -}; - -int getChannels(AVPixelFormat format) { - int numChannels = 0; - switch (format) { - case AV_PIX_FMT_BGR24: - case AV_PIX_FMT_RGB24: - numChannels = 3; - break; - default: - LOG(ERROR) << "Unknown format: " << format; - throw UnknownPixelFormatException(); - } - return numChannels; -} +const AVPixelFormat defaultVideoPixelFormat = AV_PIX_FMT_RGB24; +const AVSampleFormat defaultAudioSampleFormat = AV_SAMPLE_FMT_FLT; +const size_t decoderTimeoutMs = 600000; +const size_t timeBaseJitterUs = 100; -void fillVideoTensor( - std::vector>& frames, - torch::Tensor& videoFrame, - torch::Tensor& videoFramePts) { - int frameSize = 0; - if (videoFrame.numel() > 0) { - frameSize = videoFrame.numel() / frames.size(); +DecoderParameters getDecoderParams( + int64_t videoStartUs, + int64_t videoEndUs, + double 
seekFrameMarginUs, + int64_t getPtsOnly, + int64_t readVideoStream, + int videoWidth, + int videoHeight, + int videoMinDimension, + int64_t readAudioStream, + int audioSamples, + int audioChannels) { + DecoderParameters params; + params.headerOnly = getPtsOnly != 0; + params.seekAccuracy = seekFrameMarginUs; + params.startOffset = videoStartUs; + params.endOffset = videoEndUs; + params.timeoutMs = decoderTimeoutMs; + params.preventStaleness = false; + + if (readVideoStream == 1) { + MediaFormat videoFormat(0); + videoFormat.type = TYPE_VIDEO; + videoFormat.format.video.format = defaultVideoPixelFormat; + videoFormat.format.video.width = videoWidth; + videoFormat.format.video.height = videoHeight; + videoFormat.format.video.minDimension = videoMinDimension; + params.formats.insert(videoFormat); } - int frameCount = 0; + if (readAudioStream == 1) { + MediaFormat audioFormat; + audioFormat.type = TYPE_AUDIO; + audioFormat.format.audio.format = defaultAudioSampleFormat; + audioFormat.format.audio.samples = audioSamples; + audioFormat.format.audio.channels = audioChannels; + params.formats.insert(audioFormat); + } - uint8_t* videoFrameData = - videoFrame.numel() > 0 ? videoFrame.data_ptr() : nullptr; - int64_t* videoFramePtsData = videoFramePts.data_ptr(); + return params; +} - for (size_t i = 0; i < frames.size(); ++i) { - const auto& frame = frames[i]; - if (videoFrameData) { - memcpy( - videoFrameData + (size_t)(frameCount++) * (size_t)frameSize, - frame->frame_.get(), - frameSize * sizeof(uint8_t)); +template +void fillTensor( + std::vector& msgs, + torch::Tensor& frame, + torch::Tensor& framePts, + int64_t num, + int64_t den) { + T* frameData = frame.numel() > 0 ? 
frame.data_ptr() : nullptr; + int64_t* framePtsData = framePts.data_ptr(); + + for (size_t i = 0; i < msgs.size(); ++i) { + const auto& msg = msgs[i]; + // convert pts into original time_base + AVRational avr = {(int)num, (int)den}; + framePtsData[i] = av_rescale_q(msg.header.pts, AV_TIME_BASE_Q, avr); + VLOG(2) << "PTS type: " << sizeof(T) << ", us: " << msg.header.pts + << ", original: " << framePtsData[i]; + + if (frameData) { + auto size = msg.payload->length(); + memcpy(frameData, msg.payload->data(), size); + frameData += size / sizeof(T); } - videoFramePtsData[i] = frame->pts_; } } -void getVideoMeta( - DecoderOutput& decoderOutput, - int& numFrames, - int& height, - int& width, - int& numChannels) { - auto& videoFrames = decoderOutput.media_data_[TYPE_VIDEO].frames_; - numFrames = videoFrames.size(); - - FormatUnion& videoFormat = decoderOutput.media_data_[TYPE_VIDEO].format_; - height = videoFormat.video.height; - width = videoFormat.video.width; - numChannels = getChannels(videoFormat.video.format); +void fillVideoTensor( + std::vector& msgs, + torch::Tensor& videoFrame, + torch::Tensor& videoFramePts, + int64_t num, + int64_t den) { + fillTensor(msgs, videoFrame, videoFramePts, num, den); } void fillAudioTensor( - std::vector>& frames, + std::vector& msgs, torch::Tensor& audioFrame, - torch::Tensor& audioFramePts) { - if (frames.size() == 0) { - return; - } - - float* audioFrameData = - audioFrame.numel() > 0 ? 
audioFrame.data_ptr() : nullptr; - CHECK_EQ(audioFramePts.size(0), frames.size()); - int64_t* audioFramePtsData = audioFramePts.data_ptr(); - - int bytesPerSample = av_get_bytes_per_sample(defaultAudioSampleFormat); - - int64_t frameDataOffset = 0; - for (size_t i = 0; i < frames.size(); ++i) { - audioFramePtsData[i] = frames[i]->pts_; - if (audioFrameData) { - memcpy( - audioFrameData + frameDataOffset, - frames[i]->frame_.get(), - frames[i]->frameSize_); - frameDataOffset += (frames[i]->frameSize_ / bytesPerSample); - } - } + torch::Tensor& audioFramePts, + int64_t num, + int64_t den) { + fillTensor(msgs, audioFrame, audioFramePts, num, den); } -void getAudioMeta( - DecoderOutput& decoderOutput, - int64_t& numSamples, - int64_t& channels, - int64_t& numFrames) { - FormatUnion& audioFormat = decoderOutput.media_data_[TYPE_AUDIO].format_; - - channels = audioFormat.audio.channels; - CHECK_EQ(audioFormat.audio.format, AV_SAMPLE_FMT_FLT); - int bytesPerSample = av_get_bytes_per_sample( - static_cast(audioFormat.audio.format)); - - // auto& audioFrames = decoderOutput.media_frames_[TYPE_AUDIO]; - auto& audioFrames = decoderOutput.media_data_[TYPE_AUDIO].frames_; - numFrames = audioFrames.size(); - int64_t frameSizeTotal = 0; - for (auto const& decodedFrame : audioFrames) { - frameSizeTotal += static_cast(decodedFrame->frameSize_); +void offsetsToUs( + double& seekFrameMargin, + int64_t readVideoStream, + int64_t videoStartPts, + int64_t videoEndPts, + int64_t videoTimeBaseNum, + int64_t videoTimeBaseDen, + int64_t readAudioStream, + int64_t audioStartPts, + int64_t audioEndPts, + int64_t audioTimeBaseNum, + int64_t audioTimeBaseDen, + int64_t& videoStartUs, + int64_t& videoEndUs) { + seekFrameMargin *= AV_TIME_BASE; + videoStartUs = 0; + videoEndUs = -1; + + if (readVideoStream) { + AVRational vr = {(int)videoTimeBaseNum, (int)videoTimeBaseDen}; + if (videoStartPts > 0) { + videoStartUs = av_rescale_q(videoStartPts, vr, AV_TIME_BASE_Q); + } + if (videoEndPts > 0) { + 
// Add jitter to the end of the range to avoid conversion/rounding error. + // Small value 100us won't be enough to select the next frame, but enough + // to compensate rounding error due to the multiple conversions. + videoEndUs = + timeBaseJitterUs + av_rescale_q(videoEndPts, vr, AV_TIME_BASE_Q); + } + } else if (readAudioStream) { + AVRational ar = {(int)audioTimeBaseNum, (int)audioTimeBaseDen}; + if (audioStartPts > 0) { + videoStartUs = av_rescale_q(audioStartPts, ar, AV_TIME_BASE_Q); + } + if (audioEndPts > 0) { + // Add jitter to the end of the range to avoid conversion/rounding error. + // Small value 100us won't be enough to select the next frame, but enough + // to compensate rounding error due to the multiple conversions. + videoEndUs = + timeBaseJitterUs + av_rescale_q(audioEndPts, ar, AV_TIME_BASE_Q); + } } - VLOG(2) << "numFrames: " << numFrames; - VLOG(2) << "frameSizeTotal: " << frameSizeTotal; - VLOG(2) << "channels: " << channels; - VLOG(2) << "bytesPerSample: " << bytesPerSample; - CHECK_EQ(frameSizeTotal % (channels * bytesPerSample), 0); - numSamples = frameSizeTotal / (channels * bytesPerSample); } torch::List readVideo( @@ -165,36 +184,74 @@ torch::List readVideo( int64_t audioEndPts, int64_t audioTimeBaseNum, int64_t audioTimeBaseDen) { - unique_ptr params = util::getDecoderParams( + int64_t videoStartUs, videoEndUs; + + offsetsToUs( seekFrameMargin, - getPtsOnly, readVideoStream, - width, - height, - minDimension, videoStartPts, videoEndPts, videoTimeBaseNum, videoTimeBaseDen, readAudioStream, - audioSamples, - audioChannels, audioStartPts, audioEndPts, audioTimeBaseNum, - audioTimeBaseDen); - - FfmpegDecoder decoder; - DecoderOutput decoderOutput; + audioTimeBaseDen, + videoStartUs, + videoEndUs); + + DecoderParameters params = getDecoderParams( + videoStartUs, // videoStartPts + videoEndUs, // videoEndPts + seekFrameMargin, // seekFrameMargin + getPtsOnly, // getPtsOnly + readVideoStream, // readVideoStream + width, // width + height, // 
height + minDimension, // minDimension + readAudioStream, // readAudioStream + audioSamples, // audioSamples + audioChannels // audioChannels + ); + SyncDecoder decoder; + std::vector audioMessages, videoMessages; + DecoderInCallback callback = nullptr; if (isReadFile) { - decoder.decodeFile(std::move(params), videoPath, decoderOutput); + params.uri = videoPath; } else { - decoder.decodeMemory( - std::move(params), - input_video.data_ptr(), - input_video.size(0), - decoderOutput); + callback = MemoryBuffer::getCallback( + input_video.data_ptr(), input_video.size(0)); + } + + const auto now = std::chrono::system_clock::now(); + + VLOG(2) << "Video decoding [" << videoPath << "] has started"; + + bool succeeded; + if ((succeeded = decoder.init(params, std::move(callback)))) { + int res; + DecoderOutputMessage msg; + while (0 == (res = decoder.decode(&msg, decoderTimeoutMs))) { + if (msg.header.format.type == TYPE_VIDEO) { + videoMessages.push_back(std::move(msg)); + } + if (msg.header.format.type == TYPE_AUDIO) { + audioMessages.push_back(std::move(msg)); + } + msg.payload.reset(); + } + + decoder.shutdown(); + + const auto then = std::chrono::system_clock::now(); + VLOG(2) << "Video decoding [" << videoPath << "] has finished, " + << std::chrono::duration_cast(then - now) + .count() + << " us"; + } else { + LOG(ERROR) << "Decoder initialization has failed"; } // video section @@ -204,12 +261,21 @@ torch::List readVideo( torch::Tensor videoFps = torch::zeros({0}, torch::kFloat); torch::Tensor videoDuration = torch::zeros({0}, torch::kLong); - if (readVideoStream == 1) { - auto it = decoderOutput.media_data_.find(TYPE_VIDEO); - if (it != decoderOutput.media_data_.end()) { - int numVideoFrames, outHeight, outWidth, numChannels; - getVideoMeta( - decoderOutput, numVideoFrames, outHeight, outWidth, numChannels); + VLOG(2) << "Requested clip" + << ", videoStartPts: " << videoStartPts + << ", videoEndPts: " << videoEndPts + << ", videoStartUs: " << videoStartUs + << ", 
videoEndUs: " << videoEndUs; + + if (succeeded && readVideoStream == 1) { + if (!videoMessages.empty()) { + const auto& header = videoMessages[0].header; + const auto& media = header.format; + const auto& format = media.format.video; + int numVideoFrames = videoMessages.size(); + int outHeight = format.height; + int outWidth = format.width; + int numChannels = 3; // decoder guarantees the default AV_PIX_FMT_RGB24 if (getPtsOnly == 0) { videoFrame = torch::zeros( @@ -218,23 +284,26 @@ torch::List readVideo( videoFramePts = torch::zeros({numVideoFrames}, torch::kLong); + VLOG(2) << "video duration: " << media.duration << ", fps: " << header.fps + << ", num: " << media.num << ", den: " << media.den + << ", num frames: " << numVideoFrames; + fillVideoTensor( - decoderOutput.media_data_[TYPE_VIDEO].frames_, - videoFrame, - videoFramePts); + videoMessages, videoFrame, videoFramePts, media.num, media.den); videoTimeBase = torch::zeros({2}, torch::kInt); int* videoTimeBaseData = videoTimeBase.data_ptr(); - videoTimeBaseData[0] = it->second.format_.video.timeBaseNum; - videoTimeBaseData[1] = it->second.format_.video.timeBaseDen; + videoTimeBaseData[0] = media.num; + videoTimeBaseData[1] = media.den; videoFps = torch::zeros({1}, torch::kFloat); float* videoFpsData = videoFps.data_ptr(); - videoFpsData[0] = it->second.format_.video.fps; + videoFpsData[0] = header.fps; videoDuration = torch::zeros({1}, torch::kLong); int64_t* videoDurationData = videoDuration.data_ptr(); - videoDurationData[0] = it->second.format_.video.duration; + AVRational avr = {(int)media.num, (int)media.den}; + videoDurationData[0] = av_rescale_q(media.duration, AV_TIME_BASE_Q, avr); } else { VLOG(1) << "Miss video stream"; } @@ -246,39 +315,53 @@ torch::List readVideo( torch::Tensor audioTimeBase = torch::zeros({0}, torch::kInt); torch::Tensor audioSampleRate = torch::zeros({0}, torch::kInt); torch::Tensor audioDuration = torch::zeros({0}, torch::kLong); - if (readAudioStream == 1) { - auto it = 
decoderOutput.media_data_.find(TYPE_AUDIO); - if (it != decoderOutput.media_data_.end()) { - VLOG(1) << "Find audio stream"; - int64_t numAudioSamples = 0, outAudioChannels = 0, numAudioFrames = 0; - getAudioMeta( - decoderOutput, numAudioSamples, outAudioChannels, numAudioFrames); - VLOG(2) << "numAudioSamples: " << numAudioSamples; - VLOG(2) << "outAudioChannels: " << outAudioChannels; - VLOG(2) << "numAudioFrames: " << numAudioFrames; + if (succeeded && readAudioStream == 1) { + if (!audioMessages.empty()) { + const auto& header = audioMessages[0].header; + const auto& media = header.format; + const auto& format = media.format.audio; + int64_t outAudioChannels = format.channels; + int bytesPerSample = + av_get_bytes_per_sample(static_cast(format.format)); + + int numAudioFrames = audioMessages.size(); if (getPtsOnly == 0) { + int64_t frameSizeTotal = 0; + for (auto const& audioMessage : audioMessages) { + frameSizeTotal += audioMessage.payload->length(); + } + + CHECK_EQ(frameSizeTotal % (outAudioChannels * bytesPerSample), 0); + int64_t numAudioSamples = + frameSizeTotal / (outAudioChannels * bytesPerSample); + audioFrame = torch::zeros({numAudioSamples, outAudioChannels}, torch::kFloat); } audioFramePts = torch::zeros({numAudioFrames}, torch::kLong); + + VLOG(2) << "audio duration: " << media.duration + << ", channels: " << format.channels + << ", sample rate: " << format.samples << ", num: " << media.num + << ", den: " << media.den; + fillAudioTensor( - decoderOutput.media_data_[TYPE_AUDIO].frames_, - audioFrame, - audioFramePts); + audioMessages, audioFrame, audioFramePts, media.num, media.den); audioTimeBase = torch::zeros({2}, torch::kInt); int* audioTimeBaseData = audioTimeBase.data_ptr(); - audioTimeBaseData[0] = it->second.format_.audio.timeBaseNum; - audioTimeBaseData[1] = it->second.format_.audio.timeBaseDen; + audioTimeBaseData[0] = media.num; + audioTimeBaseData[1] = media.den; audioSampleRate = torch::zeros({1}, torch::kInt); int* 
audioSampleRateData = audioSampleRate.data_ptr(); - audioSampleRateData[0] = it->second.format_.audio.samples; + audioSampleRateData[0] = format.samples; audioDuration = torch::zeros({1}, torch::kLong); int64_t* audioDurationData = audioDuration.data_ptr(); - audioDurationData[0] = it->second.format_.audio.duration; + AVRational avr = {(int)media.num, (int)media.den}; + audioDurationData[0] = av_rescale_q(media.duration, AV_TIME_BASE_Q, avr); } else { VLOG(1) << "Miss audio stream"; } @@ -388,57 +471,90 @@ torch::List probeVideo( bool isReadFile, const torch::Tensor& input_video, std::string videoPath) { - unique_ptr params = util::getDecoderParams( + DecoderParameters params = getDecoderParams( + 0, // videoStartUs + -1, // videoEndUs 0, // seekFrameMargin - 0, // getPtsOnly + 1, // getPtsOnly 1, // readVideoStream 0, // width 0, // height 0, // minDimension - 0, // videoStartPts - 0, // videoEndPts - 0, // videoTimeBaseNum - 1, // videoTimeBaseDen 1, // readAudioStream 0, // audioSamples - 0, // audioChannels - 0, // audioStartPts - 0, // audioEndPts - 0, // audioTimeBaseNum - 1 // audioTimeBaseDen + 0 // audioChannels ); - FfmpegDecoder decoder; - DecoderOutput decoderOutput; + SyncDecoder decoder; + DecoderOutputMessage audioMessage, videoMessage; + DecoderInCallback callback = nullptr; + + const auto now = std::chrono::system_clock::now(); + + VLOG(2) << "Video probing [" << videoPath << "] has started"; + if (isReadFile) { - decoder.probeFile(std::move(params), videoPath, decoderOutput); + params.uri = videoPath; + } else { + callback = MemoryBuffer::getCallback( + input_video.data_ptr(), input_video.size(0)); + } + + bool succeeded; + bool gotAudio = false, gotVideo = false; + if ((succeeded = decoder.init(params, std::move(callback)))) { + int res; + DecoderOutputMessage msg; + while (0 == (res = decoder.decode(&msg, decoderTimeoutMs)) && + (!gotAudio || !gotVideo)) { + if (msg.header.format.type == TYPE_VIDEO && !gotVideo) { + videoMessage = 
std::move(msg); + gotVideo = true; + } + if (msg.header.format.type == TYPE_AUDIO && !gotAudio) { + audioMessage = std::move(msg); + gotAudio = true; + } + msg.payload.reset(); + } + succeeded = (res == 0 || res == ENODATA); + decoder.shutdown(); + + const auto then = std::chrono::system_clock::now(); + VLOG(2) << "Prob decoding [" << videoPath << "] has finished, " + << std::chrono::duration_cast(then - now) + .count() + << " us"; } else { - decoder.probeMemory( - std::move(params), - input_video.data_ptr(), - input_video.size(0), - decoderOutput); + LOG(ERROR) << "Decoder initialization has failed"; } + // video section torch::Tensor videoTimeBase = torch::zeros({0}, torch::kInt); torch::Tensor videoFps = torch::zeros({0}, torch::kFloat); torch::Tensor videoDuration = torch::zeros({0}, torch::kLong); - auto it = decoderOutput.media_data_.find(TYPE_VIDEO); - if (it != decoderOutput.media_data_.end()) { - VLOG(1) << "Find video stream"; + if (succeeded && gotVideo) { videoTimeBase = torch::zeros({2}, torch::kInt); int* videoTimeBaseData = videoTimeBase.data_ptr(); - videoTimeBaseData[0] = it->second.format_.video.timeBaseNum; - videoTimeBaseData[1] = it->second.format_.video.timeBaseDen; + const auto& header = videoMessage.header; + const auto& media = header.format; + + videoTimeBaseData[0] = media.num; + videoTimeBaseData[1] = media.den; videoFps = torch::zeros({1}, torch::kFloat); float* videoFpsData = videoFps.data_ptr(); - videoFpsData[0] = it->second.format_.video.fps; + videoFpsData[0] = header.fps; videoDuration = torch::zeros({1}, torch::kLong); int64_t* videoDurationData = videoDuration.data_ptr(); - videoDurationData[0] = it->second.format_.video.duration; + AVRational avr = {(int)media.num, (int)media.den}; + videoDurationData[0] = av_rescale_q(media.duration, AV_TIME_BASE_Q, avr); + + VLOG(2) << "Prob fps: " << header.fps << ", duration: " << media.duration + << ", num: " << media.num << ", den: " << media.den; + } else { VLOG(1) << "Miss video 
stream"; } @@ -448,21 +564,28 @@ torch::List probeVideo( torch::Tensor audioSampleRate = torch::zeros({0}, torch::kInt); torch::Tensor audioDuration = torch::zeros({0}, torch::kLong); - it = decoderOutput.media_data_.find(TYPE_AUDIO); - if (it != decoderOutput.media_data_.end()) { - VLOG(1) << "Find audio stream"; + if (succeeded && gotAudio) { audioTimeBase = torch::zeros({2}, torch::kInt); int* audioTimeBaseData = audioTimeBase.data_ptr(); - audioTimeBaseData[0] = it->second.format_.audio.timeBaseNum; - audioTimeBaseData[1] = it->second.format_.audio.timeBaseDen; + const auto& header = audioMessage.header; + const auto& media = header.format; + const auto& format = media.format.audio; + + audioTimeBaseData[0] = media.num; + audioTimeBaseData[1] = media.den; audioSampleRate = torch::zeros({1}, torch::kInt); int* audioSampleRateData = audioSampleRate.data_ptr(); - audioSampleRateData[0] = it->second.format_.audio.samples; + audioSampleRateData[0] = format.samples; audioDuration = torch::zeros({1}, torch::kLong); int64_t* audioDurationData = audioDuration.data_ptr(); - audioDurationData[0] = it->second.format_.audio.duration; + AVRational avr = {(int)media.num, (int)media.den}; + audioDurationData[0] = av_rescale_q(media.duration, AV_TIME_BASE_Q, avr); + + VLOG(2) << "Prob sample rate: " << format.samples + << ", duration: " << media.duration << ", num: " << media.num + << ", den: " << media.den; } else { VLOG(1) << "Miss audio stream"; } diff --git a/torchvision/csrc/cpu/video_reader/util.cpp b/torchvision/csrc/cpu/video_reader/util.cpp deleted file mode 100644 index ae3c3df0f0a..00000000000 --- a/torchvision/csrc/cpu/video_reader/util.cpp +++ /dev/null @@ -1,60 +0,0 @@ -#include "util.h" - -using namespace std; - -namespace util { - -unique_ptr getDecoderParams( - double seekFrameMargin, - int64_t getPtsOnly, - int64_t readVideoStream, - int videoWidth, - int videoHeight, - int videoMinDimension, - int64_t videoStartPts, - int64_t videoEndPts, - int 
videoTimeBaseNum, - int videoTimeBaseDen, - int64_t readAudioStream, - int audioSamples, - int audioChannels, - int64_t audioStartPts, - int64_t audioEndPts, - int audioTimeBaseNum, - int audioTimeBaseDen) { - unique_ptr params = make_unique(); - - if (readVideoStream == 1) { - params->formats.emplace( - MediaType::TYPE_VIDEO, MediaFormat(MediaType::TYPE_VIDEO)); - MediaFormat& videoFormat = params->formats[MediaType::TYPE_VIDEO]; - - videoFormat.format.video.width = videoWidth; - videoFormat.format.video.height = videoHeight; - videoFormat.format.video.minDimension = videoMinDimension; - videoFormat.format.video.startPts = videoStartPts; - videoFormat.format.video.endPts = videoEndPts; - videoFormat.format.video.timeBaseNum = videoTimeBaseNum; - videoFormat.format.video.timeBaseDen = videoTimeBaseDen; - } - - if (readAudioStream == 1) { - params->formats.emplace( - MediaType::TYPE_AUDIO, MediaFormat(MediaType::TYPE_AUDIO)); - MediaFormat& audioFormat = params->formats[MediaType::TYPE_AUDIO]; - - audioFormat.format.audio.samples = audioSamples; - audioFormat.format.audio.channels = audioChannels; - audioFormat.format.audio.startPts = audioStartPts; - audioFormat.format.audio.endPts = audioEndPts; - audioFormat.format.audio.timeBaseNum = audioTimeBaseNum; - audioFormat.format.audio.timeBaseDen = audioTimeBaseDen; - } - - params->seekFrameMargin = seekFrameMargin; - params->getPtsOnly = getPtsOnly; - - return params; -} - -} // namespace util diff --git a/torchvision/csrc/cpu/video_reader/util.h b/torchvision/csrc/cpu/video_reader/util.h deleted file mode 100644 index 6b5fd55388b..00000000000 --- a/torchvision/csrc/cpu/video_reader/util.h +++ /dev/null @@ -1,26 +0,0 @@ -#pragma once -#include -#include "FfmpegDecoder.h" - -namespace util { - -std::unique_ptr getDecoderParams( - double seekFrameMargin, - int64_t getPtsOnly, - int64_t readVideoStream, - int videoWidth, - int videoHeight, - int videoMinDimension, - int64_t videoStartPts, - int64_t videoEndPts, - int 
videoTimeBaseNum, - int videoTimeBaseDen, - int64_t readAudioStream, - int audioSamples, - int audioChannels, - int64_t audioStartPts, - int64_t audioEndPts, - int audioTimeBaseNum, - int audioTimeBaseDen); - -} // namespace util