Skip to content

Commit

Permalink
Base decoder for video. (pytorch#1747)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: pytorch#1747

Pull Request resolved: pytorch#1746

Added an FFmpeg-based decoder implementation whose functionality can be used in both VUE and TorchVision.

Differential Revision: D19358914

fbshipit-source-id: 9344a0d1531f8518fceec1032f62b0ac9d5f9975
  • Loading branch information
Yuri Putivsky authored and facebook-github-bot committed Jan 14, 2020
1 parent 90f5aac commit 6049f5b
Show file tree
Hide file tree
Showing 29 changed files with 3,330 additions and 11 deletions.
48 changes: 37 additions & 11 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,17 +127,6 @@ def get_extensions():

include_dirs = [extensions_dir]

ffmpeg_exe = distutils.spawn.find_executable('ffmpeg')
has_ffmpeg = ffmpeg_exe is not None
if has_ffmpeg:
ffmpeg_bin = os.path.dirname(ffmpeg_exe)
ffmpeg_root = os.path.dirname(ffmpeg_bin)
ffmpeg_include_dir = os.path.join(ffmpeg_root, 'include')

# TorchVision video reader
video_reader_src_dir = os.path.join(this_dir, 'torchvision', 'csrc', 'cpu', 'video_reader')
video_reader_src = glob.glob(os.path.join(video_reader_src_dir, "*.cpp"))

ext_modules = [
extension(
'torchvision._C',
Expand All @@ -157,7 +146,19 @@ def get_extensions():
extra_compile_args=extra_compile_args,
)
)

ffmpeg_exe = distutils.spawn.find_executable('ffmpeg')
has_ffmpeg = ffmpeg_exe is not None

if has_ffmpeg:
ffmpeg_bin = os.path.dirname(ffmpeg_exe)
ffmpeg_root = os.path.dirname(ffmpeg_bin)
ffmpeg_include_dir = os.path.join(ffmpeg_root, 'include')

# TorchVision video reader
video_reader_src_dir = os.path.join(this_dir, 'torchvision', 'csrc', 'cpu', 'video_reader')
video_reader_src = glob.glob(os.path.join(video_reader_src_dir, "*.cpp"))

ext_modules.append(
CppExtension(
'torchvision.video_reader',
Expand All @@ -179,6 +180,31 @@ def get_extensions():
)
)

# TorchVision base decoder
base_decoder_src_dir = os.path.join(this_dir, 'torchvision', 'csrc', 'cpu', 'decoder')
# Collect every decoder source except the sync_decoder_test driver.
# A glob such as "[!sync_decoder_test]*.cpp" does NOT exclude that file name:
# "[!...]" is a negated character class, so it silently drops every file whose
# name starts with any of the characters s, y, n, c, _, d, e, o, r or t.
# Filter on the base name instead.
base_decoder_src = [
    src
    for src in glob.glob(os.path.join(base_decoder_src_dir, "*.cpp"))
    if not os.path.basename(src).startswith('sync_decoder_test')
]

# NOTE(review): ffmpeg_include_dir is only assigned when has_ffmpeg is true;
# confirm this registration runs under that same guard.
ext_modules.append(
    CppExtension(
        'torchvision.base_decoder',
        base_decoder_src,
        include_dirs=[
            base_decoder_src_dir,
            ffmpeg_include_dir,
            extensions_dir,
        ],
        libraries=[
            'avcodec',
            'avformat',
            'avutil',
            'swresample',
            'swscale',
        ],
        extra_compile_args=["-std=c++14"],
        extra_link_args=["-std=c++14"],
    )
)

return ext_modules


Expand Down
199 changes: 199 additions & 0 deletions torchvision/csrc/cpu/decoder/audio_sampler.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
// Copyright 2004-present Facebook. All Rights Reserved.

#include "audio_sampler.h"
#include <c10/util/Logging.h>
#include "util.h"

// www.ffmpeg.org/doxygen/1.1/doc_2examples_2resampling_audio_8c-example.html#a24

#ifndef SWR_CH_MAX
#define SWR_CH_MAX 32
#endif

namespace ffmpeg {

namespace {
int preparePlanes(
const AudioFormat& fmt,
const uint8_t* buffer,
int numSamples,
uint8_t** planes) {
int result;
if ((result = av_samples_fill_arrays(
planes,
nullptr, // linesize is not needed
buffer,
fmt.channels,
numSamples,
(AVSampleFormat)fmt.format,
1)) < 0) {
LOG(ERROR) << "av_samples_fill_arrays failed, err: "
<< Util::generateErrorDesc(result)
<< ", numSamples: " << numSamples << ", fmt: " << fmt.format;
}
return result;
}
} // namespace

AudioSampler::AudioSampler(void* logCtx) : logCtx_(logCtx) {}

// Frees the resampler context if one is still allocated.
AudioSampler::~AudioSampler() {
  this->cleanUp();
}

void AudioSampler::shutdown() {
cleanUp();
}

bool AudioSampler::init(const SamplerParameters& params) {
cleanUp();

if (params.type != MediaType::TYPE_AUDIO) {
LOG(ERROR) << "Invalid media type, expected MediaType::TYPE_AUDIO";
return false;
}

swrContext_ = swr_alloc_set_opts(
nullptr,
av_get_default_channel_layout(params.out.audio.channels),
(AVSampleFormat)params.out.audio.format,
params.out.audio.samples,
av_get_default_channel_layout(params.in.audio.channels),
(AVSampleFormat)params.in.audio.format,
params.in.audio.samples,
0,
logCtx_);
if (swrContext_ == nullptr) {
LOG(ERROR) << "Cannot allocate SwrContext";
return false;
}

int result;
if ((result = swr_init(swrContext_)) < 0) {
LOG(ERROR) << "swr_init faield, err: " << Util::generateErrorDesc(result)
<< ", in -> format: " << params.in.audio.format
<< ", channels: " << params.in.audio.channels
<< ", samples: " << params.in.audio.samples
<< ", out -> format: " << params.out.audio.format
<< ", channels: " << params.out.audio.channels
<< ", samples: " << params.out.audio.samples;
return false;
}

// set formats
params_ = params;
return true;
}

/**
 * Estimates how many output samples a conversion of `inSamples` input samples
 * yields: the resampler's current delay (queried in input-rate units) plus the
 * new input, rescaled from the input rate to the output rate and rounded up.
 */
int AudioSampler::numOutputSamples(int inSamples) const {
  const auto& inRate = params_.in.audio.samples;
  const auto& outRate = params_.out.audio.samples;

  const auto pendingInput = swr_get_delay(swrContext_, inRate) + inSamples;
  return av_rescale_rnd(pendingInput, outRate, inRate, AV_ROUND_UP);
}

/**
 * Number of output bytes needed to resample `frame`:
 * bytes-per-sample * estimated output samples * output channels.
 * A null `frame` is treated as zero input samples.
 */
int AudioSampler::getSamplesBytes(AVFrame* frame) const {
  const auto& outFormat = params_.out.audio;
  const int inSamples = frame ? frame->nb_samples : 0;
  const int bytesPerSample =
      av_get_bytes_per_sample((AVSampleFormat)outFormat.format);

  return bytesPerSample * numOutputSamples(inSamples) * outFormat.channels;
}

int AudioSampler::sample(
const uint8_t* inPlanes[],
int inNumSamples,
ByteStorage* out,
int outNumSamples) {
uint8_t* outPlanes[SWR_CH_MAX] = {nullptr};
int result;
if ((result = preparePlanes(
params_.out.audio, out->writableTail(), outNumSamples, outPlanes)) <
0) {
return result;
}

if ((result = swr_convert(
swrContext_, &outPlanes[0], outNumSamples, inPlanes, inNumSamples)) <
0) {
LOG(ERROR) << "swr_convert faield, err: "
<< Util::generateErrorDesc(result);
return result;
}

CHECK_LE(result, outNumSamples);

if ((result = av_samples_get_buffer_size(
nullptr,
params_.out.audio.channels,
result,
(AVSampleFormat)params_.out.audio.format,
1)) > 0) {
out->append(result);
}
return result;
}

/**
 * Resamples a decoded `frame` into `out`. A null `frame` supplies no input
 * (zero samples and a null plane array are forwarded to the converter).
 *
 * `out` must already have enough free tail space for the estimated output;
 * this is enforced with CHECK_LE.
 *
 * @return 0 when no output samples are expected, otherwise the result of the
 *         plane-based sample() overload.
 */
int AudioSampler::sample(AVFrame* frame, ByteStorage* out) {
  const int inNumSamples = frame ? frame->nb_samples : 0;
  const auto outNumSamples = numOutputSamples(inNumSamples);
  if (outNumSamples == 0) {
    return 0;
  }

  const auto bytesPerSample =
      av_get_bytes_per_sample((AVSampleFormat)params_.out.audio.format);
  const auto samplesBytes =
      bytesPerSample * outNumSamples * params_.out.audio.channels;

  // bytes must be allocated
  CHECK_LE(samplesBytes, out->tail());

  const uint8_t** inPlanes =
      frame ? (const uint8_t**)&frame->data[0] : nullptr;
  return sample(inPlanes, inNumSamples, out, outNumSamples);
}

/**
 * MediaSampler override: resamples the raw audio bytes in `in` into `out`.
 *
 * `out` is cleared and grown to hold the estimated output before conversion.
 * A null `in` supplies no input (zero samples and a null plane array are
 * forwarded to the converter).
 *
 * @return 0 when no output samples are expected, a negative error code when
 *         the input planes cannot be prepared, otherwise the result of the
 *         plane-based sample() overload.
 */
int AudioSampler::sample(const ByteStorage* in, ByteStorage* out) {
  const auto inSampleSize =
      av_get_bytes_per_sample((AVSampleFormat)params_.in.audio.format);

  // Samples per channel contained in the input buffer.
  const auto inNumSamples =
      in ? in->length() / inSampleSize / params_.in.audio.channels : 0;

  const auto outNumSamples = numOutputSamples(inNumSamples);
  if (outNumSamples == 0) {
    return 0;
  }

  const auto outSampleSize =
      av_get_bytes_per_sample((AVSampleFormat)params_.out.audio.format);
  const auto samplesBytes =
      outSampleSize * outNumSamples * params_.out.audio.channels;

  out->clear();
  out->ensure(samplesBytes);

  uint8_t* inPlanes[SWR_CH_MAX] = {nullptr};
  if (in != nullptr) {
    const int status =
        preparePlanes(params_.in.audio, in->data(), inNumSamples, inPlanes);
    if (status < 0) {
      return status;
    }
  }

  return sample(
      in ? (const uint8_t**)inPlanes : nullptr,
      inNumSamples,
      out,
      outNumSamples);
}

// Frees the resampler context, if any, and leaves swrContext_ null so the
// call is safe to repeat.
void AudioSampler::cleanUp() {
  if (swrContext_ == nullptr) {
    return;
  }
  swr_free(&swrContext_);
  swrContext_ = nullptr;
}

} // namespace ffmpeg
46 changes: 46 additions & 0 deletions torchvision/csrc/cpu/decoder/audio_sampler.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
// Copyright 2004-present Facebook. All Rights Reserved.

#pragma once

#include "defs.h"

extern "C" {
#include <libswresample/swresample.h>
}

namespace ffmpeg {

/**
 * Transcodes audio from one format into another (sample format, channel
 * count, sample rate) using libswresample.
 */
class AudioSampler : public MediaSampler {
 public:
  // logCtx is an opaque logging context forwarded to libswresample.
  explicit AudioSampler(void* logCtx);
  ~AudioSampler() override;

  // The sampler holds a raw SwrContext that is freed on destruction; a copy
  // would free the same context twice, so copying is disabled.
  AudioSampler(const AudioSampler&) = delete;
  AudioSampler& operator=(const AudioSampler&) = delete;

  // MediaSampler overrides
  // Creates the resampler for params; returns false on failure.
  bool init(const SamplerParameters& params) override;
  // Resamples raw bytes from in into out (out is cleared first).
  int sample(const ByteStorage* in, ByteStorage* out) override;
  // Releases the resampler.
  void shutdown() override;

  // Number of output bytes needed to resample frame (null frame = no input).
  int getSamplesBytes(AVFrame* frame) const;
  // Resamples frame into out; out must already have enough free space.
  int sample(AVFrame* frame, ByteStorage* out);

 private:
  // close resources
  void cleanUp();
  // Estimated number of output samples for inSamples input samples.
  int numOutputSamples(int inSamples) const;
  // Shared conversion step used by both public sample() overloads.
  int sample(
      const uint8_t* inPlanes[],
      int inNumSamples,
      ByteStorage* out,
      int outNumSamples);

 private:
  SwrContext* swrContext_{nullptr};
  void* logCtx_{nullptr};
};

} // namespace ffmpeg
Loading

0 comments on commit 6049f5b

Please sign in to comment.