Skip to content

Commit

Permalink
Integrated base decoder into VideoReader class and video_utils.py (py…
Browse files Browse the repository at this point in the history
…torch#1766)

Summary:
Pull Request resolved: pytorch#1766

Replaced FfmpegDecoder (incompatible with VUE) with the base decoder (compatible with VUE).
Modified the Python utilities in video_utils.py for internal simplification. The public interface is preserved.

Differential Revision: D19415903

fbshipit-source-id: c750e2b74573a60f7f6930f1859c577e26245b89
  • Loading branch information
Yuri Putivsky authored and facebook-github-bot committed Jan 23, 2020
1 parent bf98744 commit 08d5f42
Show file tree
Hide file tree
Showing 52 changed files with 696 additions and 2,352 deletions.
32 changes: 6 additions & 26 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,41 +155,21 @@ def get_extensions():
ffmpeg_root = os.path.dirname(ffmpeg_bin)
ffmpeg_include_dir = os.path.join(ffmpeg_root, 'include')

# TorchVision video reader
# TorchVision base decoder + video reader
video_reader_src_dir = os.path.join(this_dir, 'torchvision', 'csrc', 'cpu', 'video_reader')
video_reader_src = glob.glob(os.path.join(video_reader_src_dir, "*.cpp"))

ext_modules.append(
CppExtension(
'torchvision.video_reader',
video_reader_src,
include_dirs=[
video_reader_src_dir,
ffmpeg_include_dir,
extensions_dir,
],
libraries=[
'avcodec',
'avformat',
'avutil',
'swresample',
'swscale',
],
extra_compile_args=["-std=c++14"],
extra_link_args=["-std=c++14"],
)
)

# TorchVision base decoder
base_decoder_src_dir = os.path.join(this_dir, 'torchvision', 'csrc', 'cpu', 'decoder')
base_decoder_src = glob.glob(os.path.join(base_decoder_src_dir, "[!sync_decoder_test]*.cpp"))

combined_src = video_reader_src + base_decoder_src

ext_modules.append(
CppExtension(
'torchvision.base_decoder',
base_decoder_src,
'torchvision.video_reader',
combined_src,
include_dirs=[
base_decoder_src_dir,
video_reader_src_dir,
ffmpeg_include_dir,
extensions_dir,
],
Expand Down
6 changes: 2 additions & 4 deletions torchvision/csrc/cpu/decoder/audio_sampler.cpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
// Copyright 2004-present Facebook. All Rights Reserved.

#include "audio_sampler.h"
#include "pytorch/vision/torchvision/csrc/cpu/decoder/audio_sampler.h"
#include <c10/util/Logging.h>
#include "util.h"
#include "pytorch/vision/torchvision/csrc/cpu/decoder/util.h"

// www.ffmpeg.org/doxygen/1.1/doc_2examples_2resampling_audio_8c-example.html#a24

Expand Down
4 changes: 1 addition & 3 deletions torchvision/csrc/cpu/decoder/audio_sampler.h
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
// Copyright 2004-present Facebook. All Rights Reserved.

#pragma once

#include "defs.h"
#include "pytorch/vision/torchvision/csrc/cpu/decoder/defs.h"

extern "C" {
#include <libswresample/swresample.h>
Expand Down
54 changes: 20 additions & 34 deletions torchvision/csrc/cpu/decoder/audio_stream.cpp
Original file line number Diff line number Diff line change
@@ -1,18 +1,28 @@
// Copyright 2004-present Facebook. All Rights Reserved.

#include "audio_stream.h"
#include "pytorch/vision/torchvision/csrc/cpu/decoder/audio_stream.h"
#include <c10/util/Logging.h>
#include <limits>
#include "util.h"
#include "pytorch/vision/torchvision/csrc/cpu/decoder/util.h"

namespace ffmpeg {

namespace {
// Returns true when the audio format `x` matches the decoded frame `y`:
// x.samples vs. y.sample_rate, x.channels vs. y.channels and
// x.format vs. y.format must all be equal.
bool operator==(const AudioFormat& x, const AVFrame& y) {
  if (x.samples != y.sample_rate) {
    return false;
  }
  if (x.channels != y.channels) {
    return false;
  }
  return x.format == y.format;
}

// Returns true when the audio format `x` matches the codec context `y`:
// x.samples vs. y.sample_rate, x.channels vs. y.channels and
// x.format vs. y.sample_fmt must all be equal.
bool operator==(const AudioFormat& x, const AVCodecContext& y) {
  if (x.samples != y.sample_rate) {
    return false;
  }
  if (x.channels != y.channels) {
    return false;
  }
  return x.format == y.sample_fmt;
}

// Copies the audio parameters of the decoded frame `y` into `x`
// (format, channel count, and sample_rate into x.samples) and returns `x`
// so the call can be used inside an expression.
AudioFormat& toAudioFormat(AudioFormat& x, const AVFrame& y) {
  x.format = y.format;
  x.channels = y.channels;
  x.samples = y.sample_rate;
  return x;
}

AudioFormat& toAudioFormat(AudioFormat& x, const AVCodecContext& y) {
x.samples = y.sample_rate;
x.channels = y.channels;
Expand Down Expand Up @@ -65,12 +75,15 @@ int AudioStream::initFormat() {

int AudioStream::estimateBytes(bool flush) {
ensureSampler();
if (!(sampler_->getInputFormat().audio == *codecCtx_)) {
// check if input format gets changed
if (flush ? !(sampler_->getInputFormat().audio == *codecCtx_)
: !(sampler_->getInputFormat().audio == *frame_)) {
// - reinit sampler
SamplerParameters params;
params.type = format_.type;
params.out = format_.format;
toAudioFormat(params.in.audio, *codecCtx_);
flush ? toAudioFormat(params.in.audio, *codecCtx_)
: toAudioFormat(params.in.audio, *frame_);
if (flush || !sampler_->init(params)) {
return -1;
}
Expand All @@ -84,39 +97,12 @@ int AudioStream::estimateBytes(bool flush) {
<< ", channels: " << format_.format.audio.channels
<< ", format: " << format_.format.audio.format;
}
return sampler_->getSamplesBytes(frame_);
return sampler_->getSamplesBytes(flush ? nullptr : frame_);
}

// Runs the sampler over the current frame and writes the result into `out`.
// When `flush` is set, a null frame is handed to the sampler instead of
// frame_. Returns whatever the sampler's sample() call returns.
int AudioStream::copyFrameBytes(ByteStorage* out, bool flush) {
  ensureSampler();
  // Keep the conditional expression so the argument retains frame_'s pointer
  // type (a bare nullptr literal could select a different overload).
  auto source = flush ? nullptr : frame_;
  return sampler_->sample(source, out);
}

// Fills `header` for the current audio frame: sequence number, presentation
// timestamp rescaled to AV_TIME_BASE_Q, key-frame flag, fps and format.
void AudioStream::setHeader(DecoderHeader* header) {
  header->seqno = numGenerator_++;

  // Choose the time base the frame timestamp is expressed in. When the codec
  // time_base is missing, the rescalePackage step (rescale to codec
  // time_base) would have been skipped, so the timestamp is still in the
  // stream time_base and is rescaled straight from there.
  const bool hasCodecTimeBase = codecCtx_->time_base.num != 0;
  const auto& sourceTimeBase = hasCodecTimeBase
      ? codecCtx_->time_base
      : inputCtx_->streams[format_.stream]->time_base;

  header->pts = av_rescale_q(
      av_frame_get_best_effort_timestamp(frame_),
      sourceTimeBase,
      AV_TIME_BASE_Q);

  if (convertPtsToWallTime_) {
    keeper_.adjust(header->pts);
  }

  // Every audio header is flagged as a key frame; fps is reported as NaN.
  header->keyFrame = 1;
  header->fps = std::numeric_limits<double>::quiet_NaN();
  header->format = format_;
}

} // namespace ffmpeg
9 changes: 2 additions & 7 deletions torchvision/csrc/cpu/decoder/audio_stream.h
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
// Copyright 2004-present Facebook. All Rights Reserved.

#pragma once

#include "audio_sampler.h"
#include "stream.h"
#include "time_keeper.h"
#include "pytorch/vision/torchvision/csrc/cpu/decoder/audio_sampler.h"
#include "pytorch/vision/torchvision/csrc/cpu/decoder/stream.h"

namespace ffmpeg {

Expand All @@ -25,13 +22,11 @@ class AudioStream : public Stream {
int initFormat() override;
int estimateBytes(bool flush) override;
int copyFrameBytes(ByteStorage* out, bool flush) override;
void setHeader(DecoderHeader* header) override;

void ensureSampler();

private:
std::unique_ptr<AudioSampler> sampler_;
TimeKeeper keeper_;
};

} // namespace ffmpeg
4 changes: 1 addition & 3 deletions torchvision/csrc/cpu/decoder/cc_stream.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
// Copyright 2004-present Facebook. All Rights Reserved.

#include "cc_stream.h"
#include "pytorch/vision/torchvision/csrc/cpu/decoder/cc_stream.h"

namespace ffmpeg {

Expand Down
4 changes: 1 addition & 3 deletions torchvision/csrc/cpu/decoder/cc_stream.h
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
// Copyright 2004-present Facebook. All Rights Reserved.

#pragma once

#include "subtitle_stream.h"
#include "pytorch/vision/torchvision/csrc/cpu/decoder/subtitle_stream.h"

namespace ffmpeg {

Expand Down
76 changes: 36 additions & 40 deletions torchvision/csrc/cpu/decoder/decoder.cpp
Original file line number Diff line number Diff line change
@@ -1,22 +1,18 @@
// Copyright 2004-present Facebook. All Rights Reserved.

#include "decoder.h"
#include "pytorch/vision/torchvision/csrc/cpu/decoder/decoder.h"
#include <c10/util/Logging.h>
#include <future>
#include <iostream>
#include <mutex>
#include "audio_stream.h"
#include "cc_stream.h"
#include "subtitle_stream.h"
#include "util.h"
#include "video_stream.h"
#include "pytorch/vision/torchvision/csrc/cpu/decoder/audio_stream.h"
#include "pytorch/vision/torchvision/csrc/cpu/decoder/cc_stream.h"
#include "pytorch/vision/torchvision/csrc/cpu/decoder/subtitle_stream.h"
#include "pytorch/vision/torchvision/csrc/cpu/decoder/util.h"
#include "pytorch/vision/torchvision/csrc/cpu/decoder/video_stream.h"

namespace ffmpeg {

namespace {

constexpr ssize_t kMinSeekBufferSize = 1024;
constexpr ssize_t kMaxSeekBufferSize = 4 * 1024;
constexpr size_t kIoBufferSize = 4 * 1024;
constexpr size_t kLogBufferSize = 1024;

Expand Down Expand Up @@ -205,7 +201,7 @@ void Decoder::initOnce() {
av_lockmgr_register(&ffmpeg_lock);
av_log_set_callback(Decoder::logFunction);
av_log_set_level(AV_LOG_ERROR);
LOG(INFO) << "Registered ffmpeg libs";
VLOG(1) << "Registered ffmpeg libs";
});
}

Expand Down Expand Up @@ -248,23 +244,21 @@ bool Decoder::init(const DecoderParameters& params, DecoderInCallback&& in) {
return false;
}

bool canSeek = in(nullptr, 0, 0) == 0;
bool canSeek = in(nullptr, 0, 0, 0) == 0;

if (!seekableBuffer_.init(
std::forward<DecoderInCallback>(in),
kMinSeekBufferSize,
kMaxSeekBufferSize,
params_.timeoutMs)) {
LOG(ERROR) << "seekable buffer initialization failed";
av_free(avioCtxBuffer);
avformat_close_input(&tmpCtx);
cleanUp();
return false;
}
seekableBuffer_.init(std::forward<DecoderInCallback>(in));

if (params_.isImage) {
ImageType type = ImageType::UNKNOWN;
if (!seekableBuffer_.detect(params_.timeoutMs, &type)) {
LOG(ERROR) << "can't detect image type";
av_free(avioCtxBuffer);
avformat_close_input(&tmpCtx);
cleanUp();
return false;
}
const char* fmtName = "image2";
switch (seekableBuffer_.getImageType()) {
switch (type) {
case ImageType::JPEG:
fmtName = "jpeg_pipe";
break;
Expand Down Expand Up @@ -364,16 +358,15 @@ bool Decoder::init(const DecoderParameters& params, DecoderInCallback&& in) {

onInit();

if (params.startOffsetMs != 0) {
av_seek_frame(
inputCtx_,
-1,
params.startOffsetMs * AV_TIME_BASE / 1000,
AVSEEK_FLAG_FRAME | AVSEEK_FLAG_ANY);
if (params.startOffset != 0) {
auto offset = params.startOffset <= params.seekAccuracy
? 0
: params.startOffset - params.seekAccuracy;

av_seek_frame(inputCtx_, -1, offset, AVSEEK_FLAG_BACKWARD);
}

LOG(INFO) << "Decoder initialized, log level: " << params_.logLevel;
outOfRange_ = false;
VLOG(1) << "Decoder initialized, log level: " << params_.logLevel;
return true;
}

Expand Down Expand Up @@ -418,6 +411,7 @@ bool Decoder::activateStreams() {
return false;
}
streams_.emplace(i, std::move(stream));
inRange_.set(i, true);
}
}

Expand Down Expand Up @@ -459,7 +453,7 @@ void Decoder::cleanUp() {
}

int Decoder::getBytes(size_t workingTimeInMs) {
if (outOfRange_) {
if (inRange_.none()) {
return ENODATA;
}
// decode frames until cache is full and leave thread
Expand Down Expand Up @@ -499,13 +493,11 @@ int Decoder::getBytes(size_t workingTimeInMs) {

// get stream
auto stream = findByIndex(avPacket.stream_index);
if (stream == nullptr) {
if (stream == nullptr || !inRange_.test(stream->getIndex())) {
av_packet_unref(&avPacket);
continue;
}

stream->rescalePackage(&avPacket);

AVPacket copyPacket = avPacket;

size_t numConsecutiveNoBytes = 0;
Expand Down Expand Up @@ -572,8 +564,10 @@ int Decoder::processPacket(Stream* stream, AVPacket* packet) {
if ((result = stream->decodeFrame(packet, &gotFrame)) >= 0 && gotFrame &&
stream->getFrameBytes(&msg, params_.headerOnly) > 0) {
// check end offset
if (params_.endOffsetMs <= 0 ||
!(outOfRange_ = msg.header.pts > params_.endOffsetMs * 1000)) {
bool endInRange =
params_.endOffset <= 0 || msg.header.pts <= params_.endOffset;
inRange_.set(stream->getIndex(), endInRange);
if (endInRange && msg.header.pts >= params_.startOffset) {
push(std::move(msg));
}
}
Expand All @@ -587,8 +581,10 @@ void Decoder::flushStreams() {
while (msg.payload = createByteStorage(0),
stream.second->flush(&msg, params_.headerOnly) > 0) {
// check end offset
if (params_.endOffsetMs <= 0 ||
!(outOfRange_ = msg.header.pts > params_.endOffsetMs * 1000)) {
bool endInRange =
params_.endOffset <= 0 || msg.header.pts <= params_.endOffset;
inRange_.set(stream.second->getIndex(), endInRange);
if (endInRange && msg.header.pts >= params_.startOffset) {
push(std::move(msg));
}
}
Expand Down
10 changes: 5 additions & 5 deletions torchvision/csrc/cpu/decoder/decoder.h
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
// Copyright 2004-present Facebook. All Rights Reserved.

#pragma once

#include "seekable_buffer.h"
#include "stream.h"
#include <bitset>
#include <unordered_map>
#include "pytorch/vision/torchvision/csrc/cpu/decoder/seekable_buffer.h"
#include "pytorch/vision/torchvision/csrc/cpu/decoder/stream.h"

namespace ffmpeg {

Expand Down Expand Up @@ -72,6 +72,6 @@ class Decoder : public MediaDecoder {
AVFormatContext* inputCtx_{nullptr};
AVIOContext* avioCtx_{nullptr};
std::unordered_map<ssize_t, std::unique_ptr<Stream>> streams_;
bool outOfRange_{false};
std::bitset<64> inRange_;
};
} // namespace ffmpeg
Loading

0 comments on commit 08d5f42

Please sign in to comment.