Skip to content

Commit

Permalink
Integrated base decoder into VideoReader class and video_utils.py (#1766
Browse files Browse the repository at this point in the history
)

Summary:
Pull Request resolved: #1766

Replaced FfmpegDecoder (incompatible with VUE) with the base decoder (compatible with VUE).
Modified the Python utilities in video_utils.py to simplify their internals. The public interface is preserved.

Reviewed By: fmassa

Differential Revision: D19415903

fbshipit-source-id: f55ac5ecf5ddaa218f48e69317814a5cc274ecf2
  • Loading branch information
Yuri Putivsky authored and facebook-github-bot committed Feb 3, 2020
1 parent ccdc9f7 commit a7b2287
Show file tree
Hide file tree
Showing 53 changed files with 1,144 additions and 2,597 deletions.
32 changes: 6 additions & 26 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,41 +155,21 @@ def get_extensions():
ffmpeg_root = os.path.dirname(ffmpeg_bin)
ffmpeg_include_dir = os.path.join(ffmpeg_root, 'include')

# TorchVision video reader
# TorchVision base decoder + video reader
video_reader_src_dir = os.path.join(this_dir, 'torchvision', 'csrc', 'cpu', 'video_reader')
video_reader_src = glob.glob(os.path.join(video_reader_src_dir, "*.cpp"))

ext_modules.append(
CppExtension(
'torchvision.video_reader',
video_reader_src,
include_dirs=[
video_reader_src_dir,
ffmpeg_include_dir,
extensions_dir,
],
libraries=[
'avcodec',
'avformat',
'avutil',
'swresample',
'swscale',
],
extra_compile_args=["-std=c++14"],
extra_link_args=["-std=c++14"],
)
)

# TorchVision base decoder
base_decoder_src_dir = os.path.join(this_dir, 'torchvision', 'csrc', 'cpu', 'decoder')
base_decoder_src = glob.glob(os.path.join(base_decoder_src_dir, "[!sync_decoder_test]*.cpp"))

combined_src = video_reader_src + base_decoder_src

ext_modules.append(
CppExtension(
'torchvision.base_decoder',
base_decoder_src,
'torchvision.video_reader',
combined_src,
include_dirs=[
base_decoder_src_dir,
video_reader_src_dir,
ffmpeg_include_dir,
extensions_dir,
],
Expand Down
40 changes: 22 additions & 18 deletions torchvision/csrc/cpu/decoder/audio_sampler.cpp
Original file line number Diff line number Diff line change
@@ -1,15 +1,10 @@
// Copyright 2004-present Facebook. All Rights Reserved.

#include "audio_sampler.h"
#include <c10/util/Logging.h>
#include "util.h"

// www.ffmpeg.org/doxygen/1.1/doc_2examples_2resampling_audio_8c-example.html#a24

#ifndef SWR_CH_MAX
#define SWR_CH_MAX 32
#endif
#define AVRESAMPLE_MAX_CHANNELS 32

// www.ffmpeg.org/doxygen/1.1/doc_2examples_2resampling_audio_8c-example.html#a24
namespace ffmpeg {

namespace {
Expand Down Expand Up @@ -94,17 +89,20 @@ int AudioSampler::numOutputSamples(int inSamples) const {
}

int AudioSampler::getSamplesBytes(AVFrame* frame) const {
return av_get_bytes_per_sample((AVSampleFormat)params_.out.audio.format) *
numOutputSamples(frame ? frame->nb_samples : 0) *
params_.out.audio.channels;
return av_samples_get_buffer_size(
nullptr,
params_.out.audio.channels,
numOutputSamples(frame ? frame->nb_samples : 0),
(AVSampleFormat)params_.out.audio.format,
1);
}

int AudioSampler::sample(
const uint8_t* inPlanes[],
int inNumSamples,
ByteStorage* out,
int outNumSamples) {
uint8_t* outPlanes[SWR_CH_MAX] = {nullptr};
uint8_t* outPlanes[AVRESAMPLE_MAX_CHANNELS] = {nullptr};
int result;
if ((result = preparePlanes(
params_.out.audio, out->writableTail(), outNumSamples, outPlanes)) <
Expand Down Expand Up @@ -140,9 +138,12 @@ int AudioSampler::sample(AVFrame* frame, ByteStorage* out) {
return 0;
}

const auto samplesBytes =
av_get_bytes_per_sample((AVSampleFormat)params_.out.audio.format) *
outNumSamples * params_.out.audio.channels;
const auto samplesBytes = av_samples_get_buffer_size(
nullptr,
params_.out.audio.channels,
outNumSamples,
(AVSampleFormat)params_.out.audio.format,
1);

// bytes must be allocated
CHECK_LE(samplesBytes, out->tail());
Expand All @@ -167,14 +168,17 @@ int AudioSampler::sample(const ByteStorage* in, ByteStorage* out) {
return 0;
}

const auto samplesBytes =
av_get_bytes_per_sample((AVSampleFormat)params_.out.audio.format) *
outNumSamples * params_.out.audio.channels;
const auto samplesBytes = av_samples_get_buffer_size(
nullptr,
params_.out.audio.channels,
outNumSamples,
(AVSampleFormat)params_.out.audio.format,
1);

out->clear();
out->ensure(samplesBytes);

uint8_t* inPlanes[SWR_CH_MAX] = {nullptr};
uint8_t* inPlanes[AVRESAMPLE_MAX_CHANNELS] = {nullptr};
int result;
if (in &&
(result = preparePlanes(
Expand Down
2 changes: 0 additions & 2 deletions torchvision/csrc/cpu/decoder/audio_sampler.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
// Copyright 2004-present Facebook. All Rights Reserved.

#pragma once

#include "defs.h"
Expand Down
53 changes: 20 additions & 33 deletions torchvision/csrc/cpu/decoder/audio_stream.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
// Copyright 2004-present Facebook. All Rights Reserved.

#include "audio_stream.h"
#include <c10/util/Logging.h>
#include <limits>
Expand All @@ -8,11 +6,23 @@
namespace ffmpeg {

namespace {
// Returns true when the audio format `x` matches the decoded frame `y` in
// sample rate, channel count and sample format.
bool operator==(const AudioFormat& x, const AVFrame& y) {
  if (x.samples != y.sample_rate) {
    return false;
  }
  if (x.channels != y.channels) {
    return false;
  }
  return x.format == y.format;
}

// Returns true when the audio format `x` matches the codec context `y` in
// sample rate, channel count and sample format.
bool operator==(const AudioFormat& x, const AVCodecContext& y) {
  if (x.samples != y.sample_rate) {
    return false;
  }
  if (x.channels != y.channels) {
    return false;
  }
  return x.format == y.sample_fmt;
}

// Fills `x` with the sample format, channel count and sample rate carried by
// the frame `y`, then returns `x` so that calls can be chained.
AudioFormat& toAudioFormat(AudioFormat& x, const AVFrame& y) {
  x.format = y.format;
  x.channels = y.channels;
  x.samples = y.sample_rate;
  return x;
}

AudioFormat& toAudioFormat(AudioFormat& x, const AVCodecContext& y) {
x.samples = y.sample_rate;
x.channels = y.channels;
Expand All @@ -29,7 +39,8 @@ AudioStream::AudioStream(
: Stream(
inputCtx,
MediaFormat::makeMediaFormat(format, index),
convertPtsToWallTime) {}
convertPtsToWallTime,
0) {}

AudioStream::~AudioStream() {
if (sampler_) {
Expand Down Expand Up @@ -65,12 +76,15 @@ int AudioStream::initFormat() {

int AudioStream::estimateBytes(bool flush) {
ensureSampler();
if (!(sampler_->getInputFormat().audio == *codecCtx_)) {
// check if input format gets changed
if (flush ? !(sampler_->getInputFormat().audio == *codecCtx_)
: !(sampler_->getInputFormat().audio == *frame_)) {
// - reinit sampler
SamplerParameters params;
params.type = format_.type;
params.out = format_.format;
toAudioFormat(params.in.audio, *codecCtx_);
flush ? toAudioFormat(params.in.audio, *codecCtx_)
: toAudioFormat(params.in.audio, *frame_);
if (flush || !sampler_->init(params)) {
return -1;
}
Expand All @@ -84,39 +98,12 @@ int AudioStream::estimateBytes(bool flush) {
<< ", channels: " << format_.format.audio.channels
<< ", format: " << format_.format.audio.format;
}
return sampler_->getSamplesBytes(frame_);
return sampler_->getSamplesBytes(flush ? nullptr : frame_);
}

// Runs the current frame through the sampler and writes the result to `out`.
// When `flush` is set, a null frame is handed to the sampler instead of
// `frame_`. The sampler's return value is forwarded unchanged.
int AudioStream::copyFrameBytes(ByteStorage* out, bool flush) {
  ensureSampler();
  // Keep the conditional expression so the pointer retains the type of
  // `frame_` and the same sample() overload is selected.
  auto* const source = flush ? nullptr : frame_;
  return sampler_->sample(source, out);
}

// Populates `header` for the current audio frame: sequence number,
// presentation timestamp (rescaled to AV_TIME_BASE_Q), key-frame flag, fps
// and media format.
void AudioStream::setHeader(DecoderHeader* header) {
  header->seqno = numGenerator_++;

  // Choose the time base the frame timestamp is expressed in. When the codec
  // time_base is missing, the rescalePackage step that converts to the codec
  // time_base would have been skipped, so the timestamp is still in the
  // stream time_base and is rescaled directly from there.
  const bool codecHasTimeBase = codecCtx_->time_base.num != 0;
  const AVRational sourceTimeBase = codecHasTimeBase
      ? codecCtx_->time_base
      : inputCtx_->streams[format_.stream]->time_base;

  header->pts = av_rescale_q(
      av_frame_get_best_effort_timestamp(frame_),
      sourceTimeBase,
      AV_TIME_BASE_Q);

  // Optionally let the time keeper adjust the timestamp in place.
  if (convertPtsToWallTime_) {
    keeper_.adjust(header->pts);
  }

  // This function always sets the key-frame flag and leaves fps as NaN.
  header->keyFrame = 1;
  header->fps = std::numeric_limits<double>::quiet_NaN();
  header->format = format_;
}

} // namespace ffmpeg
5 changes: 0 additions & 5 deletions torchvision/csrc/cpu/decoder/audio_stream.h
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
// Copyright 2004-present Facebook. All Rights Reserved.

#pragma once

#include "audio_sampler.h"
#include "stream.h"
#include "time_keeper.h"

namespace ffmpeg {

Expand All @@ -25,13 +22,11 @@ class AudioStream : public Stream {
int initFormat() override;
int estimateBytes(bool flush) override;
int copyFrameBytes(ByteStorage* out, bool flush) override;
void setHeader(DecoderHeader* header) override;

void ensureSampler();

private:
std::unique_ptr<AudioSampler> sampler_;
TimeKeeper keeper_;
};

} // namespace ffmpeg
2 changes: 0 additions & 2 deletions torchvision/csrc/cpu/decoder/cc_stream.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
// Copyright 2004-present Facebook. All Rights Reserved.

#include "cc_stream.h"

namespace ffmpeg {
Expand Down
2 changes: 0 additions & 2 deletions torchvision/csrc/cpu/decoder/cc_stream.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
// Copyright 2004-present Facebook. All Rights Reserved.

#pragma once

#include "subtitle_stream.h"
Expand Down
Loading

0 comments on commit a7b2287

Please sign in to comment.