Skip to content

Commit

Permalink
Update and fill the rest of ffmpeg-integration C++ code (pytorch#2113)
Browse files Browse the repository at this point in the history
Summary:
- Introduce AudioBuffer and VideoBuffer for different ways of handling frames
- Update the way option dictionary is passed
- Remove unused AutoFrameUnref
- Add SrcStreamInfo/OutputStreamInfo classes

Pull Request resolved: pytorch#2113

Reviewed By: nateanl

Differential Revision: D33356144

Pulled By: mthrok

fbshipit-source-id: e837e84fae48baa7befd5c70599bcd2cbb61514d
  • Loading branch information
mthrok authored and xiaohui-zhang committed May 4, 2022
1 parent 52c6f52 commit 822d08e
Show file tree
Hide file tree
Showing 10 changed files with 283 additions and 57 deletions.
164 changes: 141 additions & 23 deletions torchaudio/csrc/ffmpeg/buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,27 @@
namespace torchaudio {
namespace ffmpeg {

Buffer::Buffer(AVMediaType type) : media_type(type) {}
// Base-class constructor: records the chunking policy. Both members are
// const, so they must be set in the initializer list.
Buffer::Buffer(int fpc, int nc) : frames_per_chunk(fpc), num_chunks(nc) {}

// Audio specialization simply forwards the chunking policy to the base.
AudioBuffer::AudioBuffer(int fpc, int nc) : Buffer(fpc, nc) {}

// Video specialization simply forwards the chunking policy to the base.
VideoBuffer::VideoBuffer(int fpc, int nc) : Buffer(fpc, nc) {}

////////////////////////////////////////////////////////////////////////////////
// Query
////////////////////////////////////////////////////////////////////////////////
// True when the buffer can serve one chunk: with a negative
// frames_per_chunk ("fetch everything") any buffered frame suffices;
// otherwise a full chunk's worth of frames must be present.
bool Buffer::is_ready() const {
  return (frames_per_chunk < 0) ? (num_buffered_frames > 0)
                                : (num_buffered_frames >= frames_per_chunk);
}

////////////////////////////////////////////////////////////////////////////////
// Modifiers - Push Audio
////////////////////////////////////////////////////////////////////////////////
namespace {
torch::Tensor convert_audio_tensor(AVFrame* pFrame) {
// ref: https://ffmpeg.org/doxygen/4.1/filter__audio_8c_source.html#l00215
Expand Down Expand Up @@ -82,10 +101,64 @@ torch::Tensor convert_audio_tensor(AVFrame* pFrame) {
}
} // namespace

void Buffer::push_audio_frame(AVFrame* pFrame) {
chunks.push_back(convert_audio_tensor(pFrame));
// Store one incoming audio Tensor (the samples of one AVFrame), keeping the
// invariant that every Tensor in `chunks` holds exactly `frames_per_chunk`
// frames, except possibly the newest one, which may be partial.
void AudioBuffer::push_tensor(torch::Tensor t) {
  // If frames_per_chunk < 0, users want to fetch all frames.
  // Just push back to chunks and that's it.
  if (frames_per_chunk < 0) {
    chunks.push_back(t);
    num_buffered_frames += t.size(0);
    return;
  }

  // Push
  // Note:
  // For audio, the incoming tensor contains multiple samples.
  // For a small `frames_per_chunk` value, the incoming Tensor might hold
  // more than `max_frames`. If we pushed the tensor as-is, the whole
  // Tensor could be discarded at the trimming stage below, leaving the
  // buffer always empty. So we slice the incoming Tensor and push it
  // chunk-by-chunk.

  // Check the last inserted Tensor; if the total number of buffered frames
  // is not a multiple of frames_per_chunk, the last Tensor is partial.
  // Merge it with the incoming tensor and re-slice the concatenation.
  if (num_buffered_frames % frames_per_chunk) {
    torch::Tensor prev = chunks.back();
    chunks.pop_back();
    num_buffered_frames -= prev.size(0);
    t = torch::cat({prev, t}, 0);
  }

  // Slice `t` into frames_per_chunk-sized pieces; the final (possibly
  // partial) piece is pushed as-is.
  while (true) {
    int num_input_frames = t.size(0);
    if (num_input_frames <= frames_per_chunk) {
      chunks.push_back(t);
      num_buffered_frames += num_input_frames;
      break;
    }
    // The input tensor contains more frames than frames_per_chunk:
    // split off the first chunk and keep slicing the remainder.
    auto splits = torch::tensor_split(t, {frames_per_chunk, num_input_frames});
    chunks.push_back(splits[0]);
    num_buffered_frames += frames_per_chunk;
    t = splits[1];
  }

  // Trim
  // Since frames_per_chunk > 0 here, retain at most
  // `num_chunks * frames_per_chunk` frames and discard the oldest Tensors.
  int max_frames = num_chunks * frames_per_chunk;
  while (num_buffered_frames > max_frames) {
    torch::Tensor& t = chunks.front();
    num_buffered_frames -= t.size(0);
    chunks.pop_front();
  }
}

// Convert one decoded audio AVFrame to a Tensor and buffer it,
// maintaining the per-chunk alignment enforced by push_tensor.
void AudioBuffer::push_frame(AVFrame* frame) {
  push_tensor(convert_audio_tensor(frame));
}

////////////////////////////////////////////////////////////////////////////////
// Modifiers - Push Video
////////////////////////////////////////////////////////////////////////////////
namespace {
torch::Tensor convert_image_tensor(AVFrame* pFrame) {
// ref:
Expand Down Expand Up @@ -130,34 +203,79 @@ torch::Tensor convert_image_tensor(AVFrame* pFrame) {
}
} // namespace

void Buffer::push_video_frame(AVFrame* pFrame) {
chunks.push_back(convert_image_tensor(pFrame));
// Store one incoming video Tensor. Unlike audio, the incoming Tensor is
// expected to contain only one frame, so it is pushed as-is.
void VideoBuffer::push_tensor(torch::Tensor t) {
  chunks.push_back(t);
  num_buffered_frames += t.size(0);

  // Negative frames_per_chunk means "keep everything": no trimming.
  if (frames_per_chunk < 0) {
    return;
  }

  // Trim: retain at most `num_chunks * frames_per_chunk` frames and drop
  // the oldest. Each push adds a single frame, so one check suffices to
  // restore the invariant.
  int max_frames = num_chunks * frames_per_chunk;
  if (num_buffered_frames > max_frames) {
    // Renamed from `t`: the original shadowed the parameter, which is
    // error-prone (clang-tidy bugprone-shadow).
    torch::Tensor& oldest = chunks.front();
    num_buffered_frames -= oldest.size(0);
    chunks.pop_front();
  }
}

torch::Tensor Buffer::pop_all() {
if (!chunks.size())
return torch::empty({});
// Convert one decoded video AVFrame to a Tensor and buffer it
// (one Tensor per frame in the deque).
void VideoBuffer::push_frame(AVFrame* frame) {
  push_tensor(convert_image_tensor(frame));
}

std::vector<torch::Tensor> tmp;
while (chunks.size()) {
tmp.push_back(chunks.front());
////////////////////////////////////////////////////////////////////////////////
// Modifiers - Pop
////////////////////////////////////////////////////////////////////////////////

using namespace torch::indexing;

// Pop the next chunk of frames.
// Returns an empty optional when nothing is buffered. A negative
// frames_per_chunk means "return everything buffered at once".
c10::optional<torch::Tensor> Buffer::pop_chunk() {
  if (num_buffered_frames == 0) {
    return c10::optional<torch::Tensor>{};
  }
  torch::Tensor chunk = (frames_per_chunk < 0) ? pop_all() : pop_one_chunk();
  return c10::optional<torch::Tensor>{chunk};
}

// Audio Tensors in the deque are already aligned to `frames_per_chunk`
// by push_tensor, so one chunk is simply the oldest buffered Tensor.
torch::Tensor AudioBuffer::pop_one_chunk() {
  torch::Tensor chunk = chunks.front();
  num_buffered_frames -= chunk.size(0);
  chunks.pop_front();
  return chunk;
}

// The video deque holds one frame per Tensor, so a chunk is assembled by
// collecting up to `frames_per_chunk` Tensors and concatenating them.
// Only called (via pop_chunk) when frames_per_chunk > 0 and at least one
// frame is buffered, so torch::cat never sees an empty list.
torch::Tensor VideoBuffer::pop_one_chunk() {
  std::vector<torch::Tensor> frames;
  frames.reserve(static_cast<size_t>(frames_per_chunk));
  // Cast avoids the implicit signed/unsigned comparison between
  // vector::size() (size_t) and frames_per_chunk (int).
  while (num_buffered_frames > 0 &&
         frames.size() < static_cast<size_t>(frames_per_chunk)) {
    frames.push_back(chunks.front());
    chunks.pop_front();
    num_buffered_frames -= 1;
  }
  return torch::cat(frames, 0);
}

void Buffer::push_frame(AVFrame* frame) {
switch (media_type) {
case AVMEDIA_TYPE_AUDIO:
push_audio_frame(frame);
break;
case AVMEDIA_TYPE_VIDEO:
push_video_frame(frame);
break;
default:
throw std::runtime_error(
"Unexpected media type. Only audio/video is supported.");
// Drain every buffered Tensor and concatenate them along the frame
// dimension. Common to audio/video: in the audio case each Tensor holds
// multiple frames, in the video case each Tensor holds one frame.
torch::Tensor Buffer::pop_all() {
  std::vector<torch::Tensor> drained;
  drained.reserve(chunks.size());
  while (!chunks.empty()) {
    torch::Tensor& front = chunks.front();
    num_buffered_frames -= front.size(0);
    drained.push_back(front);
    chunks.pop_front();
  }
  return torch::cat(drained, 0);
}

} // namespace ffmpeg
} // namespace torchaudio
74 changes: 69 additions & 5 deletions torchaudio/csrc/ffmpeg/buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,82 @@ namespace torchaudio {
namespace ffmpeg {

// Abstract base for buffering decoded frames as Tensors and serving them
// back in fixed-size chunks. AudioBuffer/VideoBuffer specialize push/pop.
// NOTE(review): the scraped diff interleaved removed lines (old AVMediaType
// member, old ctor and push_audio/video_frame declarations) into this class;
// they are dropped here so the declaration is consistent.
class Buffer {
 protected:
  // Each AVFrame is converted to a Tensor and stored here.
  std::deque<torch::Tensor> chunks;

  // The number of frames to return as a chunk.
  // If <0, the user wants to receive all the buffered frames at once.
  const int frames_per_chunk;
  // The number of chunks to retain.
  const int num_chunks;
  // The number of currently buffered frames.
  // For video, one Tensor corresponds to one frame, but for audio,
  // one Tensor contains multiple samples, so we track the count here.
  int num_buffered_frames = 0;

 public:
  Buffer(int frames_per_chunk, int num_chunks);
  virtual ~Buffer() = default;

  //////////////////////////////////////////////////////////////////////////
  // Query
  //////////////////////////////////////////////////////////////////////////
  // Check if the buffer has enough frames for a chunk.
  // If frames_per_chunk < 0, returns true when there are > 0 frames.
  // Otherwise, returns num_buffered_frames >= frames_per_chunk.
  bool is_ready() const;

  //////////////////////////////////////////////////////////////////////////
  // Modifiers
  //////////////////////////////////////////////////////////////////////////
  // Convert the frame to a Tensor and store it.
  virtual void push_frame(AVFrame* frame) = 0;

  // Pop one chunk (or everything when frames_per_chunk < 0).
  // Empty optional when nothing is buffered.
  c10::optional<torch::Tensor> pop_chunk();

 private:
  virtual torch::Tensor pop_one_chunk() = 0;
  torch::Tensor pop_all();
};

// Specialization of the handling around push/pop for audio/video.

////////////////////////////////////////////////////////////////////////////////
// AudioBuffer specialization
////////////////////////////////////////////////////////////////////////////////
// For audio, an input AVFrame contains multiple frames.
// When popping the buffered frames chunk-by-chunk, it is easier if they are
// organized by chunk when pushed to the deque object.
// Therefore, audio implements a pushing mechanism that ensures each Tensor
// in the deque holds exactly `frames_per_chunk` frames.
class AudioBuffer : public Buffer {
public:
AudioBuffer(int frames_per_chunk, int num_chunks);

void push_frame(AVFrame* frame);

private:
void push_tensor(torch::Tensor tensor);
torch::Tensor pop_one_chunk();
};

////////////////////////////////////////////////////////////////////////////////
// VideoBuffer specialization
////////////////////////////////////////////////////////////////////////////////
// For video, an input AVFrame contains one frame.
// Contrary to audio, it is simple to push one frame at a time to the deque.
// But this means that chunks consisting of multiple frames have to be
// assembled at popping time.
class VideoBuffer : public Buffer {
public:
VideoBuffer(int frames_per_chunk, int num_chunks);

void push_frame(AVFrame* frame);

private:
void push_tensor(torch::Tensor tensor);
torch::Tensor pop_one_chunk();
};

} // namespace ffmpeg
} // namespace torchaudio
25 changes: 11 additions & 14 deletions torchaudio/csrc/ffmpeg/ffmpeg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,20 @@ namespace {
// Open the input `src` (optionally forcing the input device/format) and
// return the resulting AVFormatContext. Throws std::runtime_error on
// failure. The option map is converted to a temporary AVDictionary that is
// always freed, whether or not opening succeeds.
// NOTE(review): the scraped diff interleaved the removed `AVDictionary**`
// parameter and old avformat_open_input call into this span; they are
// dropped here so the function is consistent.
AVFormatContext* get_format_context(
    const std::string& src,
    const std::string& device,
    const std::map<std::string, std::string>& option) {
  AVFormatContext* pFormat = NULL;
  AVInputFormat* pInput =
      device.empty() ? NULL : av_find_input_format(device.c_str());

  AVDictionary* dict = NULL;
  // const auto&: no need to copy (or allow mutation of) the map entries.
  for (const auto& it : option) {
    av_dict_set(&dict, it.first.c_str(), it.second.c_str(), 0);
  }

  int ret = avformat_open_input(&pFormat, src.c_str(), pInput, &dict);
  av_dict_free(&dict);

  if (ret < 0)
    throw std::runtime_error("Failed to open the input: " + src);
  return pFormat;
}
Expand All @@ -28,7 +36,7 @@ AVFormatContext* get_format_context(
AVFormatContextPtr::AVFormatContextPtr(
const std::string& src,
const std::string& device,
AVDictionary** option)
const std::map<std::string, std::string>& option)
: Wrapper<AVFormatContext, AVFormatContextDeleter>(
get_format_context(src, device, option)) {
if (avformat_find_stream_info(ptr.get(), NULL) < 0)
Expand Down Expand Up @@ -82,17 +90,6 @@ AVFrame* get_av_frame() {

// RAII wrapper construction: takes ownership of the frame returned by
// get_av_frame() (body not visible in this hunk — presumably allocates via
// av_frame_alloc and throws on failure; confirm against the full file).
AVFramePtr::AVFramePtr() : Wrapper<AVFrame, AVFrameDeleter>(get_av_frame()) {}

///////////////////////////////////////////////////////////////////////////////
// AVFrame - buffer unref
////////////////////////////////////////////////////////////////////////////////
AutoFrameUnref::AutoFrameUnref(AVFramePtr& p) : p_(p){};
AutoFrameUnref::~AutoFrameUnref() {
av_frame_unref(p_);
}
AutoFrameUnref::operator AVFrame*() const {
return p_;
}

////////////////////////////////////////////////////////////////////////////////
// AVCodecContext
////////////////////////////////////////////////////////////////////////////////
Expand Down
15 changes: 2 additions & 13 deletions torchaudio/csrc/ffmpeg/ffmpeg.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// One-stop header for all ffmpeg needs
#pragma once
#include <cstdint>
#include <map>
#include <memory>
#include <string>

Expand Down Expand Up @@ -58,7 +59,7 @@ struct AVFormatContextPtr
AVFormatContextPtr(
const std::string& src,
const std::string& device,
AVDictionary** option);
const std::map<std::string, std::string>& option);
};

////////////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -101,18 +102,6 @@ struct AVFramePtr : public Wrapper<AVFrame, AVFrameDeleter> {
AVFramePtr();
};

////////////////////////////////////////////////////////////////////////////////
// AVFrame - buffer unref
////////////////////////////////////////////////////////////////////////////////
// Similar to `AutoPacketUnref`, this structure will release the memory
// allocated for frame content.
struct AutoFrameUnref {
AVFramePtr& p_;
AutoFrameUnref(AVFramePtr& p);
~AutoFrameUnref();
operator AVFrame*() const;
};

////////////////////////////////////////////////////////////////////////////////
// AVCodecContext
////////////////////////////////////////////////////////////////////////////////
Expand Down
7 changes: 7 additions & 0 deletions torchaudio/csrc/ffmpeg/filter_graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,13 @@ FilterGraph::FilterGraph(
create_filter();
}

////////////////////////////////////////////////////////////////////////////////
// Query method
////////////////////////////////////////////////////////////////////////////////
// Returns the filter description string this graph was constructed with.
// (Dropped the stray `;` after the function body — an empty declaration
// flagged by -Wextra-semi.)
std::string FilterGraph::get_description() const {
  return filter_description;
}

////////////////////////////////////////////////////////////////////////////////
// Configuration methods
////////////////////////////////////////////////////////////////////////////////
Expand Down
Loading

0 comments on commit 822d08e

Please sign in to comment.