Merge remote-tracking branch 'upstream/master' into tj/rtti/helper-macro/base
t-jankowski committed Feb 13, 2025
2 parents 9f259a9 + b8cdf8c commit c97d9f1
Showing 72 changed files with 1,227 additions and 210 deletions.
35 changes: 30 additions & 5 deletions docs/articles_en/assets/snippets/ov_caching.cpp
@@ -61,12 +61,36 @@ bool cachingSupported = std::find(caps.begin(), caps.end(), ov::device::capabili
}

void part4() {
std::string modelPath = "/tmp/myModel.xml";
std::string device = "GPU";
ov::Core core; // Step 1: create ov::Core object
bool hasGPU = false; // Step 1a: Check if GPU is available
auto devices = core.get_available_devices();
for (auto&& supported : devices) {
hasGPU |= supported.find(device) != std::string::npos;
}
if(!hasGPU) {
return;
}
core.set_property(ov::cache_dir("/path/to/cache/dir")); // Step 1b: Enable caching
//! [ov:caching:part4]
// Note: model path needs to point to the *.xml file, not *.bin when using the IR model format.
auto compiled = core.compile_model(modelPath,
device,
ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE));
//! [ov:caching:part4]
if (!compiled) {
throw std::runtime_error("error");
}
}

void part5() {
std::string modelPath = "/tmp/myModel.xml";
std::string device = "CPU";
ov::Core core; // Step 1: create ov::Core object
core.set_property(ov::cache_dir("/path/to/cache/dir")); // Step 1b: Enable caching
auto model = core.read_model(modelPath); // Step 2: Read Model
//! [ov:caching:part4]
//! [ov:caching:part5]
ov::AnyMap config;
ov::EncryptionCallbacks encryption_callbacks;
static const char codec_key[] = {0x30, 0x60, 0x70, 0x02, 0x04, 0x08, 0x3F, 0x6F, 0x72, 0x74, 0x78, 0x7F};
@@ -84,13 +108,13 @@ encryption_callbacks.encrypt = codec_xor;
encryption_callbacks.decrypt = codec_xor;
config.insert(ov::cache_encryption_callbacks(encryption_callbacks)); // Step 4: Set device configuration
auto compiled = core.compile_model(model, device, config); // Step 5: LoadNetwork
//! [ov:caching:part4]
//! [ov:caching:part5]
if (!compiled) {
throw std::runtime_error("error");
}
}

void part5() {
void part6() {
std::string modelPath = "/tmp/myModel.xml";
std::string device = "GPU";
ov::Core core; // Step 1: create ov::Core object
@@ -103,7 +127,7 @@ void part5() {
return;
}
core.set_property(ov::cache_dir("/path/to/cache/dir")); // Step 1b: Enable caching
//! [ov:caching:part5]
//! [ov:caching:part6]
static const char codec_key[] = {0x30, 0x60, 0x70, 0x02, 0x04, 0x08, 0x3F, 0x6F, 0x72, 0x74, 0x78, 0x7F};
auto codec_xor = [&](const std::string& source_str) {
auto key_size = sizeof(codec_key);
@@ -119,7 +143,7 @@ auto compiled = core.compile_model(modelPath,
device,
ov::cache_encryption_callbacks(ov::EncryptionCallbacks{codec_xor, codec_xor}),
ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE)); // Step 5: Compile model
//! [ov:caching:part5]
//! [ov:caching:part6]
if (!compiled) {
throw std::runtime_error("error");
}
@@ -133,6 +157,7 @@ int main() {
part3();
part4();
part5();
part6();
} catch (...) {
}
return 0;
16 changes: 13 additions & 3 deletions docs/articles_en/assets/snippets/ov_caching.py
@@ -44,6 +44,16 @@
# ! [ov:caching:part3]

# ! [ov:caching:part4]
core = ov.Core()
if "GPU" in core.available_devices:
core.set_property({props.cache_dir: path_to_cache_dir})
config_cache = {}
config_cache["CACHE_MODE"] = "OPTIMIZE_SIZE"
# Note: model path needs to point to the *.xml file, not *.bin when using the IR model format.
compiled_model = core.compile_model(model=model_path, device_name='GPU', config=config_cache)
# ! [ov:caching:part4]

# ! [ov:caching:part5]
import base64

def encrypt_base64(src):
@@ -58,9 +68,9 @@ def decrypt_base64(src):
config_cache["CACHE_ENCRYPTION_CALLBACKS"] = [encrypt_base64, decrypt_base64]
model = core.read_model(model=model_path)
compiled_model = core.compile_model(model=model, device_name=device_name, config=config_cache)
# ! [ov:caching:part4]

# ! [ov:caching:part5]

# ! [ov:caching:part6]
import base64

def encrypt_base64(src):
@@ -76,4 +86,4 @@ def decrypt_base64(src):
config_cache["CACHE_ENCRYPTION_CALLBACKS"] = [encrypt_base64, decrypt_base64]
config_cache["CACHE_MODE"] = "OPTIMIZE_SIZE"
compiled_model = core.compile_model(model=model_path, device_name='GPU', config=config_cache)
# ! [ov:caching:part5]
# ! [ov:caching:part6]
@@ -140,6 +140,35 @@ model caching, use the following code in your application:
:language: cpp
:fragment: [ov:caching:part3]

Set ``CacheMode`` property to ``OPTIMIZE_SIZE`` to enable weightless caching
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Weightless caching creates a cache file that does not contain the model weights; instead, the weights are loaded from the original model file whenever the cached model is used. This reduces the size of the cache file.

.. tab-set::

.. tab-item:: Python
:sync: py

.. doxygensnippet:: docs/articles_en/assets/snippets/ov_caching.py
:language: py
:fragment: [ov:caching:part4]

.. tab-item:: C++
:sync: cpp

.. doxygensnippet:: docs/articles_en/assets/snippets/ov_caching.cpp
:language: cpp
:fragment: [ov:caching:part4]

.. important::

Currently, this property is supported only by the GPU Plugin and IR model format.

.. important::

Some weights that undergo transformations during model compilation may not be eligible for weightless caching. In such cases, the cache file contains these weights, while the weightless caching mechanism is still used for the rest. The feature supports some of the common transformations and replicates them after loading the model from the cache.
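
For orientation, here is a minimal, self-contained sketch of the flow shown in the
snippets above (the model path, device, and cache directory are placeholders; adapt
them to your setup):

.. code-block:: cpp

   #include <openvino/openvino.hpp>

   int main() {
       ov::Core core;
       // Hypothetical cache location; any writable directory works.
       core.set_property(ov::cache_dir("model_cache"));
       // OPTIMIZE_SIZE requests weightless caching: the cache entry keeps the
       // compiled topology but not the weights, which are re-read from the
       // original IR (*.xml + *.bin) whenever the cached model is loaded.
       auto compiled = core.compile_model("model.xml", "GPU",
                                          ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE));
       return 0;
   }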

Enable cache encryption
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

@@ -154,16 +183,16 @@ loading it from the cache. Currently, this property can be set only in ``compile

.. doxygensnippet:: docs/articles_en/assets/snippets/ov_caching.py
:language: py
:fragment: [ov:caching:part4]
:fragment: [ov:caching:part5]

.. tab-item:: C++
:sync: cpp

.. doxygensnippet:: docs/articles_en/assets/snippets/ov_caching.cpp
:language: cpp
:fragment: [ov:caching:part4]
:fragment: [ov:caching:part5]

Full encryption only works when the ``CacheMode`` property is set to ``OPTIMIZE_SIZE``.
If model caching is enabled in the GPU Plugin, the model topology can be encrypted while it is saved to the cache and decrypted when it is loaded from the cache. Full encryption only works when the ``CacheMode`` property is set to ``OPTIMIZE_SIZE``.
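
As a rough sketch, a symmetric XOR codec in the spirit of the snippet referenced
below can serve as both callbacks (illustration only, not real cryptography; the
key byte and paths are placeholders):

.. code-block:: cpp

   #include <openvino/openvino.hpp>
   #include <string>

   int main() {
       ov::Core core;
       core.set_property(ov::cache_dir("model_cache"));  // hypothetical cache dir
       // XOR with a fixed byte is its own inverse, so one lambda can serve as
       // both the encrypt and the decrypt callback.
       auto codec_xor = [](const std::string& source) {
           std::string result = source;
           for (auto& c : result)
               c ^= 0x5A;  // arbitrary illustrative key byte
           return result;
       };
       auto compiled = core.compile_model(
           "model.xml", "GPU",
           ov::cache_encryption_callbacks(ov::EncryptionCallbacks{codec_xor, codec_xor}),
           ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE));  // required for full encryption
       return 0;
   }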

.. tab-set::

Expand All @@ -172,14 +201,14 @@ Full encryption only works when the ``CacheMode`` property is set to ``OPTIMIZE_

.. doxygensnippet:: docs/articles_en/assets/snippets/ov_caching.py
:language: py
:fragment: [ov:caching:part5]
:fragment: [ov:caching:part6]

.. tab-item:: C++
:sync: cpp

.. doxygensnippet:: docs/articles_en/assets/snippets/ov_caching.cpp
:language: cpp
:fragment: [ov:caching:part5]
:fragment: [ov:caching:part6]

.. important::

4 changes: 2 additions & 2 deletions src/bindings/python/src/openvino/runtime/__init__.py
@@ -6,12 +6,12 @@
# noqa: F401

import warnings
warnings.simplefilter("always", DeprecationWarning)
warnings.filterwarnings("once", category=DeprecationWarning, module="openvino.runtime")
warnings.warn(
"The `openvino.runtime` module is deprecated and will be removed in the 2026.0 release. "
"Please replace `openvino.runtime` with `openvino`.",
DeprecationWarning,
stacklevel=2
stacklevel=1
)


29 changes: 24 additions & 5 deletions src/common/snippets/include/snippets/emitter.hpp
@@ -52,16 +52,21 @@ class Emitter {

/**
* @brief called by generator to generate code to produce target code for a specific operation
* @details
* Avoids passing default arguments to the virtual function while still allowing
* the caller to invoke emit_code without "pool" or "gpr"
* @param in vector of vector argument registers
* @param out vector of vector resulting registers
* @param pool optional vector of free vector registers which might be used inside method
* @param gpr vector of free generam puproce registers which might be used inside method
* @param gpr vector of free general purpose registers which might be used inside method
* @return void
*/
virtual void emit_code(const std::vector<size_t>& in,
const std::vector<size_t>& out,
const std::vector<size_t>& pool = {},
const std::vector<size_t>& gpr = {}) const = 0;
void emit_code(const std::vector<size_t>& in,
const std::vector<size_t>& out,
const std::vector<size_t>& pool = {},
const std::vector<size_t>& gpr = {}) const {
emit_code_impl(in, out, pool, gpr);
}

/**
* @brief called by generator to generate data section, if needed for a specific operation
@@ -70,6 +75,20 @@ class Emitter {
virtual void emit_data() const {}

virtual ~Emitter() = default;

private:
/**
* @brief called by generator to generate code to produce target code for a specific operation
* @param in vector of vector argument registers
* @param out vector of vector resulting registers
* @param pool optional vector of free vector registers which might be used inside method
* @param gpr vector of free general purpose registers which might be used inside method
* @return void
*/
virtual void emit_code_impl(const std::vector<size_t>& in,
const std::vector<size_t>& out,
const std::vector<size_t>& pool,
const std::vector<size_t>& gpr) const = 0;
};

} // namespace snippets
9 changes: 5 additions & 4 deletions src/common/snippets/tests/include/lowering_utils.hpp
@@ -19,11 +19,12 @@ using BlockedShapeVector = ov::snippets::op::Subgraph::BlockedShapeVector;
class DummyEmitter : public ov::snippets::Emitter {
public:
DummyEmitter(const std::vector<ov::Node::type_info_t>& custom_opset = {}) : ov::snippets::Emitter() {}
void emit_code(const std::vector<size_t>&,
const std::vector<size_t>&,
const std::vector<size_t>&,
const std::vector<size_t>&) const override {}
void emit_data() const override {}
protected:
void emit_code_impl(const std::vector<size_t>&,
const std::vector<size_t>&,
const std::vector<size_t>&,
const std::vector<size_t>&) const override {}
};

struct DummyCompiledSnippet : public ov::snippets::CompiledSnippet {
@@ -723,6 +723,7 @@ ov::pass::RoPEFusionQwen::RoPEFusionQwen(int split_output_id) {
auto rotary_emb_cos = makePattern("[1,?,1,?]"); // [1,..4096,1,128]
auto rotary_emb_sin = makePattern("[1,?,1,?]"); // [1,..4096,1,128]
auto qkv_proj = makePattern("[?,?,?]"); // [?,?,12288]
auto position_ids = makePattern();

auto head_cnt = ov::gen_pattern::Symbol("head_cnt");
auto head_size = ov::gen_pattern::Symbol("head_size");
@@ -749,14 +750,19 @@ ov::pass::RoPEFusionQwen::RoPEFusionQwen(int split_output_id) {
auto ScatterUpdate_463814 = makePattern<opset3::ScatterUpdate>({{0, 0}, {1}, Gather_377635 | neg_Multiply, {0}});
auto slice_Slice_446 =
makePattern<ov::opset8::Slice>({rotary_emb_cos, Gather_377635 | neg_Multiply, {INT_MAX}, {1}, {1}});

auto gather_cos_by_pos_ids = makePattern<opset8::Gather>({rotary_emb_cos, position_ids, 1}, {{"batch_dims", 0}});
auto reshape_cos_to_expected_layout =
makePattern<opset8::Reshape>({gather_cos_by_pos_ids, {-1, 1, 1, 128}}, {{"special_zero", false}});

auto slice_StridedSlice_446 = GenStridedSlice(rotary_emb_cos,
ScatterUpdate_463814,
{0, INT_MAX},
{1, 1},
1); // tensor_array<f32[1,..4096,1,128]>
auto mul_Multiply_552 =
makePattern<opset1::Multiply>({slice_Slice_543, slice_StridedSlice_446 | slice_Slice_446},
{{"auto_broadcast", "numpy"}}); // tensor_array<f32[?,?,32,128]>
auto mul_Multiply_552 = makePattern<opset1::Multiply>(
{slice_Slice_543, slice_StridedSlice_446 | slice_Slice_446 | reshape_cos_to_expected_layout},
{{"auto_broadcast", "numpy"}}); // tensor_array<f32[?,?,32,128]>

auto reshape_opt1 = [&](std::shared_ptr<Node> input_BLHS) {
auto ShapeOf_485814 = makePattern<opset1::ShapeOf>({input_BLHS}, {});
Expand Down Expand Up @@ -790,18 +796,28 @@ ov::pass::RoPEFusionQwen::RoPEFusionQwen(int split_output_id) {
makePattern<opset1::Squeeze>({Multiply_567527, -2}); // tensor_array<f32[?,?,32,64]>
auto ListUnpack_586_Squeeze =
makePattern<opset1::Squeeze>({ListUnpack_586_Split->output(0), -2}); // tensor_array<f32[?,?,32,64]>
auto cat_Concat_593 = makePattern<opset1::Concat>({ListUnpack_586_Squeeze_0, ListUnpack_586_Squeeze},
{{"axis", -1}}); // tensor_array<f32[?,?,32,128]>

auto ListUnpack_Squeeze_0_1 =
makePattern<opset1::Reshape>({Multiply_567527, {-1, 1, 32, 64}}, {{"special_zero", false}});
auto ListUnpack_Squeeze_1 =
makePattern<opset1::Reshape>({ListUnpack_586_Split->output(0), {-1, 1, 32, 64}}, {{"special_zero", false}});

auto cat_Concat_593 = makePattern<opset1::Concat>(
{ListUnpack_586_Squeeze_0 | ListUnpack_Squeeze_0_1, ListUnpack_586_Squeeze | ListUnpack_Squeeze_1},
{{"axis", -1}}); // tensor_array<f32[?,?,32,128]>
auto slice_StridedSlice_470 = GenStridedSlice(rotary_emb_sin,
ScatterUpdate_463814,
{0, INT_MAX},
{1, 1},
1); // tensor_array<f32[1,..4096,1,128]>
auto slice_Slice_470 =
makePattern<opset8::Slice>({rotary_emb_sin, Gather_377635 | neg_Multiply, {INT_MAX}, {1}, {1}});
auto mul_Multiply_594 =
makePattern<opset1::Multiply>({cat_Concat_593, slice_StridedSlice_470 | slice_Slice_470},
{{"auto_broadcast", "numpy"}}); // tensor_array<f32[?,?,32,128]>
auto gather_sin_by_pos_ids = makePattern<opset8::Gather>({rotary_emb_sin, position_ids, 1}, {{"batch_dims", 0}});
auto reshape_sin_to_expected_layout =
makePattern<opset8::Reshape>({gather_sin_by_pos_ids, {-1, 1, 1, 128}}, {{"special_zero", false}});
auto mul_Multiply_594 = makePattern<opset1::Multiply>(
{cat_Concat_593, slice_StridedSlice_470 | slice_Slice_470 | reshape_sin_to_expected_layout},
{{"auto_broadcast", "numpy"}}); // tensor_array<f32[?,?,32,128]>
auto add_Add_597 = makePattern<opset1::Add>({mul_Multiply_552, mul_Multiply_594},
{{"auto_broadcast", "numpy"}}); // tensor_array<f32[?,?,32,128]>

Expand Down Expand Up @@ -858,16 +874,25 @@ ov::pass::RoPEFusionQwen::RoPEFusionQwen(int split_output_id) {
new_args.push_back(pattern_map.at(rotary_emb_cos));
new_args.push_back(pattern_map.at(rotary_emb_sin));

ov::NodeVector rt_from = {pattern_map.at(Multiply_567527).get_node_shared_ptr(),
pattern_map.at(cat_Concat_593).get_node_shared_ptr(),
pattern_map.at(mul_Multiply_594).get_node_shared_ptr(),
pattern_map.at(add_Add_597).get_node_shared_ptr()};

if (pattern_map.count(position_ids)) {
new_args.push_back(pattern_map.at(position_ids));
config.gather_position_arg_id = 3;
rt_from.push_back(pattern_map.at(ListUnpack_Squeeze_0_1).get_node_shared_ptr());
rt_from.push_back(pattern_map.at(ListUnpack_Squeeze_1).get_node_shared_ptr());
} else {
rt_from.push_back(pattern_map.at(ListUnpack_586_Squeeze_0).get_node_shared_ptr());
rt_from.push_back(pattern_map.at(ListUnpack_586_Squeeze).get_node_shared_ptr());
}

auto old_node = root;
auto new_node = std::make_shared<op::internal::RoPE>(new_args, config);
new_node->set_friendly_name(old_node->get_friendly_name());
ov::copy_runtime_info({pattern_map.at(Multiply_567527).get_node_shared_ptr(),
pattern_map.at(ListUnpack_586_Squeeze_0).get_node_shared_ptr(),
pattern_map.at(ListUnpack_586_Squeeze).get_node_shared_ptr(),
pattern_map.at(cat_Concat_593).get_node_shared_ptr(),
pattern_map.at(mul_Multiply_594).get_node_shared_ptr(),
pattern_map.at(add_Add_597).get_node_shared_ptr()},
new_node);
ov::copy_runtime_info(rt_from, new_node);
ov::replace_node(old_node, new_node);
return true;
};