Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update and fill the rest of ffmpeg-integration C++ code #2113

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
164 changes: 141 additions & 23 deletions torchaudio/csrc/ffmpeg/buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,27 @@
namespace torchaudio {
namespace ffmpeg {

// Base buffer: records chunking configuration shared by audio/video buffers.
Buffer::Buffer(int frames_per_chunk, int num_chunks)
    : frames_per_chunk(frames_per_chunk), num_chunks(num_chunks) {}

AudioBuffer::AudioBuffer(int frames_per_chunk, int num_chunks)
    : Buffer(frames_per_chunk, num_chunks) {}

VideoBuffer::VideoBuffer(int frames_per_chunk, int num_chunks)
    : Buffer(frames_per_chunk, num_chunks) {}

////////////////////////////////////////////////////////////////////////////////
// Query
////////////////////////////////////////////////////////////////////////////////
bool Buffer::is_ready() const {
  // In "fetch everything" mode (frames_per_chunk < 0), a single buffered
  // frame is enough; otherwise a full chunk must be available.
  const int required = (frames_per_chunk < 0) ? 1 : frames_per_chunk;
  return num_buffered_frames >= required;
}

////////////////////////////////////////////////////////////////////////////////
// Modifiers - Push Audio
////////////////////////////////////////////////////////////////////////////////
namespace {
torch::Tensor convert_audio_tensor(AVFrame* pFrame) {
// ref: https://ffmpeg.org/doxygen/4.1/filter__audio_8c_source.html#l00215
Expand Down Expand Up @@ -82,10 +101,64 @@ torch::Tensor convert_audio_tensor(AVFrame* pFrame) {
}
} // namespace

void Buffer::push_audio_frame(AVFrame* pFrame) {
chunks.push_back(convert_audio_tensor(pFrame));
void AudioBuffer::push_tensor(torch::Tensor t) {
  // If frames_per_chunk < 0, the user wants to fetch all frames at once.
  // Just push the Tensor to the deque as-is.
  if (frames_per_chunk < 0) {
    chunks.push_back(t);
    num_buffered_frames += t.size(0);
    return;
  }

  // Push
  // Note:
  // For audio, one incoming Tensor contains multiple samples (frames).
  // For a small `frames_per_chunk` value, a single Tensor might hold more
  // than `max_frames` frames. If we pushed the Tensor as-is, the whole
  // Tensor could be popped at the trimming stage, leaving the buffer always
  // empty. So we slice the incoming Tensor and push it chunk-by-chunk.

  // Check the last inserted Tensor, and if its number of frames is not
  // frames_per_chunk, merge it with the incoming Tensor and re-slice.
  if (num_buffered_frames % frames_per_chunk) {
    torch::Tensor prev = chunks.back();
    chunks.pop_back();
    num_buffered_frames -= prev.size(0);
    t = torch::cat({prev, t}, 0);
  }

  // Slice the (possibly merged) Tensor into frames_per_chunk-sized pieces.
  while (true) {
    int num_input_frames = t.size(0);
    if (num_input_frames <= frames_per_chunk) {
      chunks.push_back(t);
      num_buffered_frames += num_input_frames;
      break;
    }
    // The input Tensor contains more frames than frames_per_chunk:
    // push the first frames_per_chunk frames and continue with the rest.
    auto splits = torch::tensor_split(t, {frames_per_chunk, num_input_frames});
    chunks.push_back(splits[0]);
    num_buffered_frames += frames_per_chunk;
    t = splits[1];
  }

  // Trim
  // Only retain up to num_chunks * frames_per_chunk frames,
  // discarding the oldest ones.
  int max_frames = num_chunks * frames_per_chunk;
  while (num_buffered_frames > max_frames) {
    torch::Tensor& t = chunks.front();
    num_buffered_frames -= t.size(0);
    chunks.pop_front();
  }
}

// Convert the audio AVFrame into a Tensor and buffer it chunk-aligned.
void AudioBuffer::push_frame(AVFrame* frame) {
  push_tensor(convert_audio_tensor(frame));
}

////////////////////////////////////////////////////////////////////////////////
// Modifiers - Push Video
////////////////////////////////////////////////////////////////////////////////
namespace {
torch::Tensor convert_image_tensor(AVFrame* pFrame) {
// ref:
Expand Down Expand Up @@ -130,34 +203,79 @@ torch::Tensor convert_image_tensor(AVFrame* pFrame) {
}
} // namespace

void Buffer::push_video_frame(AVFrame* pFrame) {
chunks.push_back(convert_image_tensor(pFrame));
void VideoBuffer::push_tensor(torch::Tensor t) {
  // Each incoming video Tensor is expected to contain a single frame.
  num_buffered_frames += t.size(0);
  chunks.push_back(std::move(t));

  // Unlimited-buffer mode: retain everything.
  if (frames_per_chunk < 0) {
    return;
  }

  // Trim: drop the oldest frame once capacity is exceeded.
  const int capacity = num_chunks * frames_per_chunk;
  if (num_buffered_frames > capacity) {
    num_buffered_frames -= chunks.front().size(0);
    chunks.pop_front();
  }
}

torch::Tensor Buffer::pop_all() {
if (!chunks.size())
return torch::empty({});
// Convert the video AVFrame into a (single-frame) Tensor and buffer it.
void VideoBuffer::push_frame(AVFrame* frame) {
  push_tensor(convert_image_tensor(frame));
}

std::vector<torch::Tensor> tmp;
while (chunks.size()) {
tmp.push_back(chunks.front());
////////////////////////////////////////////////////////////////////////////////
// Modifiers - Pop
////////////////////////////////////////////////////////////////////////////////

using namespace torch::indexing;

c10::optional<torch::Tensor> Buffer::pop_chunk() {
  // Empty buffer -> no chunk to return.
  if (num_buffered_frames == 0) {
    return {};
  }
  // Negative frames_per_chunk means the caller wants everything at once.
  if (frames_per_chunk < 0) {
    return pop_all();
  }
  return pop_one_chunk();
}

torch::Tensor AudioBuffer::pop_one_chunk() {
  // Audio Tensors were aligned to `frames_per_chunk` at push time,
  // so the front of the deque is exactly one chunk.
  torch::Tensor chunk = chunks.front();
  chunks.pop_front();
  num_buffered_frames -= chunk.size(0);
  return chunk;
}

torch::Tensor VideoBuffer::pop_one_chunk() {
  // The video deque contains one frame per Tensor; gather up to
  // frames_per_chunk of them and concatenate into a single chunk.
  std::vector<torch::Tensor> frames;
  // Cast avoids a signed/unsigned comparison between size_t and int.
  while (num_buffered_frames > 0 &&
         static_cast<int>(frames.size()) < frames_per_chunk) {
    frames.push_back(chunks.front());
    chunks.pop_front();
    num_buffered_frames -= 1;
  }
  return torch::cat(frames, 0);
}

void Buffer::push_frame(AVFrame* frame) {
switch (media_type) {
case AVMEDIA_TYPE_AUDIO:
push_audio_frame(frame);
break;
case AVMEDIA_TYPE_VIDEO:
push_video_frame(frame);
break;
default:
throw std::runtime_error(
"Unexpected media type. Only audio/video is supported.");
torch::Tensor Buffer::pop_all() {
  // Note:
  // This method is common to audio/video.
  // In the audio case, each Tensor contains multiple frames;
  // in the video case, each Tensor contains one frame.
  std::vector<torch::Tensor> drained;
  drained.reserve(chunks.size());
  while (!chunks.empty()) {
    torch::Tensor& front = chunks.front();
    num_buffered_frames -= front.size(0);
    drained.push_back(front);
    chunks.pop_front();
  }
  return torch::cat(drained, 0);
}

} // namespace ffmpeg
} // namespace torchaudio
74 changes: 69 additions & 5 deletions torchaudio/csrc/ffmpeg/buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,82 @@ namespace torchaudio {
namespace ffmpeg {

class Buffer {
 protected:
  // Each AVFrame is converted to a Tensor and stored here.
  std::deque<torch::Tensor> chunks;

  // The number of frames to return as a chunk.
  // If negative, the user wants to receive all buffered frames at once.
  const int frames_per_chunk;
  // The number of chunks to retain.
  const int num_chunks;
  // The number of currently buffered frames.
  // For video, one Tensor corresponds to one frame, but for audio,
  // one Tensor contains multiple samples, so we track the count here.
  int num_buffered_frames = 0;

 public:
  Buffer(int frames_per_chunk, int num_chunks);
  virtual ~Buffer() = default;

  //////////////////////////////////////////////////////////////////////////////
  // Query
  //////////////////////////////////////////////////////////////////////////////
  // Check if the buffer has enough frames to form a chunk.
  // If frames_per_chunk < 0, returns true when there is at least one frame.
  // Otherwise, returns whether num_buffered_frames >= frames_per_chunk.
  bool is_ready() const;

  //////////////////////////////////////////////////////////////////////////////
  // Modifiers
  //////////////////////////////////////////////////////////////////////////////
  // Convert the given AVFrame to Tensor(s) and store it in the buffer.
  virtual void push_frame(AVFrame* frame) = 0;

  // Pop one chunk (or all buffered frames if frames_per_chunk < 0).
  // Returns an empty optional when no frame is buffered.
  c10::optional<torch::Tensor> pop_chunk();

 private:
  virtual torch::Tensor pop_one_chunk() = 0;
  torch::Tensor pop_all();
};

// Specialization of the handling around push/pop for audio/video.

////////////////////////////////////////////////////////////////////////////////
// AudioBuffer specialization
////////////////////////////////////////////////////////////////////////////////
// For audio, input AVFrame contains multiple frames.
// When popping the buffered frames chunk-by-chunk, it is easier if they are
// organized by chunk when pushed to deque object.
// Therefore, the audio buffer implements a pushing mechanism that ensures
// each Tensor in the deque contains `frames_per_chunk` frames.
class AudioBuffer : public Buffer {
 public:
  AudioBuffer(int frames_per_chunk, int num_chunks);

  // Convert an audio AVFrame to a Tensor and buffer it chunk-aligned.
  void push_frame(AVFrame* frame);

 private:
  // Slice/align the incoming Tensor so each deque entry holds
  // `frames_per_chunk` frames, then trim the oldest entries.
  void push_tensor(torch::Tensor tensor);
  // Pop the front Tensor, which is exactly one chunk.
  torch::Tensor pop_one_chunk();
};

////////////////////////////////////////////////////////////////////////////////
// VideoBuffer specialization
////////////////////////////////////////////////////////////////////////////////
// For video, input AVFrame contains one frame.
// Contrary to audio, it is simple to push one frame at a time to the deque.
// But this means that chunks consisting of multiple frames have to be created
// at popping time.
class VideoBuffer : public Buffer {
 public:
  VideoBuffer(int frames_per_chunk, int num_chunks);

  // Convert a video AVFrame to a single-frame Tensor and buffer it.
  void push_frame(AVFrame* frame);

 private:
  // Push one frame; trims the oldest frame when capacity is exceeded.
  void push_tensor(torch::Tensor tensor);
  // Concatenate up to `frames_per_chunk` buffered frames into one chunk.
  torch::Tensor pop_one_chunk();
};

} // namespace ffmpeg
} // namespace torchaudio
25 changes: 11 additions & 14 deletions torchaudio/csrc/ffmpeg/ffmpeg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,20 @@ namespace {
// Open the input `src` (file path, URL, or device) and return the
// allocated AVFormatContext. Throws std::runtime_error on failure.
AVFormatContext* get_format_context(
    const std::string& src,
    const std::string& device,
    const std::map<std::string, std::string>& option) {
  AVFormatContext* pFormat = NULL;
  AVInputFormat* pInput =
      device.empty() ? NULL : av_find_input_format(device.c_str());

  // Convert the option map into an AVDictionary for FFmpeg.
  AVDictionary* dict = NULL;
  for (const auto& it : option) {
    av_dict_set(&dict, it.first.c_str(), it.second.c_str(), 0);
  }

  int ret = avformat_open_input(&pFormat, src.c_str(), pInput, &dict);
  // avformat_open_input consumes recognized entries; free whatever remains.
  av_dict_free(&dict);

  if (ret < 0)
    throw std::runtime_error("Failed to open the input: " + src);
  return pFormat;
}
Expand All @@ -28,7 +36,7 @@ AVFormatContext* get_format_context(
AVFormatContextPtr::AVFormatContextPtr(
const std::string& src,
const std::string& device,
AVDictionary** option)
const std::map<std::string, std::string>& option)
: Wrapper<AVFormatContext, AVFormatContextDeleter>(
get_format_context(src, device, option)) {
if (avformat_find_stream_info(ptr.get(), NULL) < 0)
Expand Down Expand Up @@ -82,17 +90,6 @@ AVFrame* get_av_frame() {

// Construct an AVFramePtr owning the frame returned by get_av_frame().
AVFramePtr::AVFramePtr() : Wrapper<AVFrame, AVFrameDeleter>(get_av_frame()) {}

///////////////////////////////////////////////////////////////////////////////
// AVFrame - buffer unref
////////////////////////////////////////////////////////////////////////////////
AutoFrameUnref::AutoFrameUnref(AVFramePtr& p) : p_(p){};
AutoFrameUnref::~AutoFrameUnref() {
av_frame_unref(p_);
}
AutoFrameUnref::operator AVFrame*() const {
return p_;
}

////////////////////////////////////////////////////////////////////////////////
// AVCodecContext
////////////////////////////////////////////////////////////////////////////////
Expand Down
15 changes: 2 additions & 13 deletions torchaudio/csrc/ffmpeg/ffmpeg.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// One stop header for all ffmepg needs
#pragma once
#include <cstdint>
#include <map>
#include <memory>
#include <string>

Expand Down Expand Up @@ -58,7 +59,7 @@ struct AVFormatContextPtr
AVFormatContextPtr(
const std::string& src,
const std::string& device,
AVDictionary** option);
const std::map<std::string, std::string>& option);
};

////////////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -101,18 +102,6 @@ struct AVFramePtr : public Wrapper<AVFrame, AVFrameDeleter> {
AVFramePtr();
};

////////////////////////////////////////////////////////////////////////////////
// AVFrame - buffer unref
////////////////////////////////////////////////////////////////////////////////
// Similar to `AutoPacketUnref`, this structure will release the memory
// allocated for frame content.
struct AutoFrameUnref {
AVFramePtr& p_;
AutoFrameUnref(AVFramePtr& p);
~AutoFrameUnref();
operator AVFrame*() const;
};

////////////////////////////////////////////////////////////////////////////////
// AVCodecContext
////////////////////////////////////////////////////////////////////////////////
Expand Down
7 changes: 7 additions & 0 deletions torchaudio/csrc/ffmpeg/filter_graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,13 @@ FilterGraph::FilterGraph(
create_filter();
}

////////////////////////////////////////////////////////////////////////////////
// Query method
////////////////////////////////////////////////////////////////////////////////
// Return the filter description string this graph was configured with.
std::string FilterGraph::get_description() const {
  return filter_description;
}

////////////////////////////////////////////////////////////////////////////////
// Configuration methods
////////////////////////////////////////////////////////////////////////////////
Expand Down
Loading