Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

OpenVINO support #1037

Merged
merged 10 commits into from
Jul 4, 2023
16 changes: 16 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ option(WHISPER_NO_AVX2 "whisper: disable AVX2" OFF)
option(WHISPER_NO_FMA "whisper: disable FMA" OFF)
option(WHISPER_NO_F16C "whisper: disable F16c" OFF)

option(WHISPER_OPENVINO "whisper: support for OpenVINO" OFF)

if (APPLE)
option(WHISPER_NO_ACCELERATE "whisper: disable Accelerate framework" OFF)
option(WHISPER_COREML "whisper: enable Core ML framework" OFF)
Expand Down Expand Up @@ -192,6 +194,10 @@ if (WHISPER_CLBLAST)
endif()
endif()

if( WHISPER_OPENVINO )
find_package(OpenVINO REQUIRED COMPONENTS Runtime)
endif()

# compiler flags

if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
Expand Down Expand Up @@ -297,6 +303,11 @@ if (WHISPER_COREML)
)
endif()

if (WHISPER_OPENVINO)
set( OpenVINO_SOURCES openvino/whisper-openvino-encoder.h openvino/whisper-openvino-encoder.cpp )
set( WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_USE_OPENVINO)
endif()

#
# whisper - this is the main library of the project
#
Expand All @@ -310,6 +321,7 @@ add_library(${TARGET}
${GGML_OPENCL_SOURCES}
whisper.h
whisper.cpp
${OpenVINO_SOURCES}
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use OPENVINO_SOURCES

However, why not make a separate target whisper.openvino similar to how whisper.coreml works?

Copy link
Contributor Author

@RyanMetcalfeInt8 RyanMetcalfeInt8 Jun 28, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let me try it again. I had originally tried to add it as a separate target and had some weird issues (something like the corresponding .Lib wasn't being generated in Windows build)-- I intended to circle back though, so thanks for the reminder.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

okay, see latest commit (76c4186)

I added openvino-encoder to dedicated OBJECT target:

add_library(${TARGET} OBJECT
        openvino/whisper-openvino-encoder.h
        openvino/whisper-openvino-encoder.cpp
        )

And this target is linked to whisper just like coreml:

if (WHISPER_OPENVINO)
    target_link_libraries(${TARGET} PRIVATE whisper.openvino)
endif()

I was thinking of making it SHARED, but I think it'd be more of a hassle to have to carry around a separate .dll / .so..

This builds fine, and did some minimal testing on Windows 11 & Ubuntu.

)

include(DefaultTargetOptions)
Expand All @@ -322,6 +334,10 @@ if (WHISPER_COREML)
target_link_libraries(${TARGET} PRIVATE whisper.coreml)
endif()

if (WHISPER_OPENVINO)
target_link_libraries(${TARGET} PRIVATE openvino::runtime)
endif()

if (MSVC)
target_link_libraries(${TARGET} PRIVATE ${WHISPER_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT})

Expand Down
7 changes: 7 additions & 0 deletions examples/main/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,8 @@ struct whisper_params {
std::string font_path = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
std::string model = "models/ggml-base.en.bin";

std::string openvino_encode_device = "CPU";

std::vector<std::string> fname_inp = {};
std::vector<std::string> fname_out = {};
};
Expand Down Expand Up @@ -146,6 +148,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if ( arg == "--prompt") { params.prompt = argv[++i]; }
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
else if (arg == "-f" || arg == "--file") { params.fname_inp.emplace_back(argv[++i]); }
else if (arg == "-oved" || arg == "--ov-e-device") { params.openvino_encode_device = argv[++i]; }
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params);
Expand Down Expand Up @@ -197,6 +200,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " --prompt PROMPT [%-7s] initial prompt\n", params.prompt.c_str());
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input WAV file path\n", "");
fprintf(stderr, " -oved D, --ov-e-device DNAME [%-7s] The OpenVINO device used for encode inference\n", params.openvino_encode_device.c_str());
fprintf(stderr, "\n");
}

Expand Down Expand Up @@ -713,6 +717,9 @@ int main(int argc, char ** argv) {
return 3;
}

// initialize openvino encoder. This has no effect on whisper.cpp builds that don't have OpenVINO configured.
whisper_ctx_init_openvino_encoder(ctx, nullptr, params.openvino_encode_device.c_str(), nullptr);

for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
const auto fname_inp = params.fname_inp[f];
const auto fname_out = f < (int) params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];
Expand Down
53 changes: 53 additions & 0 deletions models/convert-whisper-to-openvino.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import argparse
import torch
from whisper import load_model
import os
from openvino.tools import mo
from openvino.runtime import serialize
import shutil

def convert_encoder(hparams, encoder, mname):
encoder.eval()

mel = torch.zeros((1, 80, 3000))

onnx_folder=os.path.join(os.path.dirname(__file__),"onnx_encoder")

#create a directory to store the onnx model, and other collateral that is saved during onnx export procedure
if not os.path.isdir(onnx_folder):
os.makedirs(onnx_folder)

onnx_path = os.path.join(onnx_folder, "whisper_encoder.onnx")

torch.onnx.export(
encoder,
mel,
onnx_path,
input_names=["mel"],
output_names=["output_features"]
)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it's not required to export to ONNX before usage in OpenVINO.
You can use convert_model with PyTorch in-memory object https://docs.openvino.ai/2023.1/openvino_docs_OV_Converter_UG_prepare_model_convert_model_Convert_Model_From_PyTorch.html


# use model optimizer to convert onnx to OpenVINO IR format
encoder_model = mo.convert_model(onnx_path, compress_to_fp16=True)
serialize(encoder_model, xml_path='ggml-' + mname + '-encoder-openvino.xml')

#cleanup
if os.path.isdir(onnx_folder):
shutil.rmtree(onnx_folder)


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1)", required=True)
args = parser.parse_args()

if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1"]:
raise ValueError("Invalid model name")

whisper = load_model(args.model).cpu()
hparams = whisper.dims

encoder = whisper.encoder

# Convert encoder to onnx
convert_encoder(hparams, encoder, args.model)
2 changes: 2 additions & 0 deletions models/openvino-conversion-requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
openvino-dev[pytorch,onnx]

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we can use openvino>=2023.1.0 which contains update version of convert_model directly in main openvino pip package, while openvino-dev is actually deprecated.

openai-whisper
108 changes: 108 additions & 0 deletions openvino/whisper-openvino-encoder.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
#include "openvino/whisper-openvino-encoder.h"
#include "ggml.h"
#include <openvino/openvino.hpp>
#include <iostream>

struct whisper_openvino_context {
ov::InferRequest inferRequest;
};

struct whisper_openvino_context * whisper_openvino_init(const char* path_model,
const char* device,
const char* cache_dir)
{
if (!path_model || !device) {
fprintf(stderr, "%s: path_model and/or device is null\n", __func__);
return nullptr;
}

fprintf(stderr, "%s: path_model = %s, device = %s, cache_dir = %s\n",
__func__, path_model, device, cache_dir ? cache_dir : "(not set)");

whisper_openvino_context *context = new whisper_openvino_context;
try {
ov::Core core;

if (cache_dir) {
// enables caching of device-specific 'blobs' during core.compile_model
// routine. This speeds up calls to compile_model for successive runs.
core.set_property(ov::cache_dir(cache_dir));
}

//Read the OpenVINO encoder IR (.xml/.bin) from disk, producing an ov::Model object.
std::shared_ptr<ov::Model> model = core.read_model(path_model);

// Produce a compiled-model object, given the device ("CPU", "GPU", etc.)
auto compiledModel = core.compile_model(model, device);

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you can pass path_model directly to compile_model, which can speed-up loading with ov::cache_dir enabled. See https://docs.openvino.ai/2023.1/openvino_docs_OV_UG_Model_caching_overview.html#make-it-even-faster-use-compile-model-modelpath

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any practical speedup from this change?

I'm on OpenVINO 2022.3.1 for device which is EOL'ed. I can compile master and run it with cache:

whisper_openvino_init: path_model = models/ggml-base.en-encoder-openvino.xml, device = MYRIAD, cache_dir = models/ggml-base.en-encoder-openvino-cache

The speed is on par with CPU/GPU OpenVINO. And it helps RPi to inference on base model.

Copy link
Contributor Author

@RyanMetcalfeInt8 RyanMetcalfeInt8 Dec 20, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably some yes, but the speedup will be during initialization (i.e. the time it takes to pull the model / cached blob from disk and prep the execution device).


// From the compiled model object, create an infer request. This is the thing that we
// we will use later on to trigger inference execution.
context->inferRequest = compiledModel.create_infer_request();
}
catch (const std::exception& error) {
std::cout << "in openvino encoder compile routine: exception: " << error.what() << std::endl;
delete context;
context = nullptr;
}

return context;
}

void whisper_openvino_free(struct whisper_openvino_context * ctx) {
if( ctx ) {
delete ctx;
}
}

int whisper_openvino_encode(
whisper_openvino_context* ctx,
ggml_tensor* mel,
ggml_tensor* out) {

if (!ctx || !mel || !out) {
fprintf(stderr, "%s: Error! ctx / mel / out is null\n", __func__);
return 0;
}

if (mel->n_dims != 2) {
fprintf(stderr, "%s: Error! mel ggml_tensor expected to have n_dims=2, but it has n_dims=%d\n",
__func__, mel->n_dims);
return 0;
}

if (out->n_dims != 2) {
fprintf(stderr, "%s: Error! out ggml_tensor expected to have n_dims=2, but it has n_dims=%d\n",
__func__, out->n_dims);
return 0;
}

try {

//wrap the passed-in mel ggml_tensor as an OpenVINO Tensor object, and set as input tensor to infer request
{
// note, we populate shape & stride dimensions in opposite order from how they are listed in ne / nb arrays
ov::Shape input_shape = { 1, (unsigned long long)mel->ne[1], (unsigned long long)mel->ne[0] };
ov::Strides input_strides = { mel->nb[2], mel->nb[1], mel->nb[0] };
ov::Tensor input_tensor(ov::element::f32, input_shape, mel->data, input_strides);
ctx->inferRequest.set_input_tensor(input_tensor);
}

//wrap the passed-in out ggml_tensor as an OpenVINO Tensor object, and set as output tensor to infer request
{
// note, we populate shape & stride dimensions in opposite order from how they are listed in ne / nb arrays
ov::Shape output_shape = { 1, (unsigned long long)out->ne[1], (unsigned long long)out->ne[0] };
ov::Strides output_strides = { out->nb[2], out->nb[1], out->nb[0] };
ov::Tensor out_tensor(ov::element::f32, output_shape, out->data, output_strides);
ctx->inferRequest.set_output_tensor(out_tensor);
}

//run inference
ctx->inferRequest.infer();
}
catch (const std::exception& error) {
std::cout << "in openvino encode inference execution routine: exception: " << error.what() << std::endl;
return 0;
}

return 1;
}
31 changes: 31 additions & 0 deletions openvino/whisper-openvino-encoder.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
// Wrapper of the OpenVINO Whisper Encoder model
//

#if __cplusplus
extern "C" {
#endif

struct whisper_openvino_context;

// initialize openvino encoder, given path to model xml, device ("CPU", "GPU", etc.), and
// path to cache_dir. Returns null upon failure.
struct whisper_openvino_context * whisper_openvino_init(const char * path_model,
const char * device,
const char * cache_dir);

// clean up a ctx previously returned from whisper_openvino_init()
void whisper_openvino_free(struct whisper_openvino_context * ctx);

struct ggml_tensor;

// Perform encode using OpenVINO.
// Returns 1 on success
// Returns 0 on failure
int whisper_openvino_encode(
whisper_openvino_context* ctx,
ggml_tensor* mel,
ggml_tensor* out);

#if __cplusplus
}
#endif
Loading