Merge remote-tracking branch 'upstream/master' into tj/rtti/helper-macro/base
t-jankowski committed Feb 13, 2025
2 parents 9f259a9 + b8cdf8c commit c97d9f1
Showing 72 changed files with 1,227 additions and 210 deletions.
35 changes: 30 additions & 5 deletions docs/articles_en/assets/snippets/ov_caching.cpp
@@ -61,12 +61,36 @@ bool cachingSupported = std::find(caps.begin(), caps.end(), ov::device::capabili
}

void part4() {
std::string modelPath = "/tmp/myModel.xml";
std::string device = "GPU";
ov::Core core; // Step 1: create ov::Core object
bool hasGPU = false; // Step 1a: Check if GPU is available
auto devices = core.get_available_devices();
for (auto&& supported : devices) {
hasGPU |= supported.find(device) != std::string::npos;
}
if(!hasGPU) {
return;
}
core.set_property(ov::cache_dir("/path/to/cache/dir")); // Step 1b: Enable caching
//! [ov:caching:part4]
// Note: model path needs to point to the *.xml file, not *.bin when using the IR model format.
auto compiled = core.compile_model(modelPath,
device,
ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE));
//! [ov:caching:part4]
if (!compiled) {
throw std::runtime_error("error");
}
}

void part5() {
std::string modelPath = "/tmp/myModel.xml";
std::string device = "CPU";
ov::Core core; // Step 1: create ov::Core object
core.set_property(ov::cache_dir("/path/to/cache/dir")); // Step 1b: Enable caching
auto model = core.read_model(modelPath); // Step 2: Read Model
//! [ov:caching:part4]
//! [ov:caching:part5]
ov::AnyMap config;
ov::EncryptionCallbacks encryption_callbacks;
static const char codec_key[] = {0x30, 0x60, 0x70, 0x02, 0x04, 0x08, 0x3F, 0x6F, 0x72, 0x74, 0x78, 0x7F};
@@ -84,13 +108,13 @@ encryption_callbacks.encrypt = codec_xor;
encryption_callbacks.decrypt = codec_xor;
config.insert(ov::cache_encryption_callbacks(encryption_callbacks)); // Step 4: Set device configuration
auto compiled = core.compile_model(model, device, config); // Step 5: LoadNetwork
//! [ov:caching:part4]
//! [ov:caching:part5]
if (!compiled) {
throw std::runtime_error("error");
}
}

void part5() {
void part6() {
std::string modelPath = "/tmp/myModel.xml";
std::string device = "GPU";
ov::Core core; // Step 1: create ov::Core object
@@ -103,7 +127,7 @@ void part5() {
return;
}
core.set_property(ov::cache_dir("/path/to/cache/dir")); // Step 1b: Enable caching
//! [ov:caching:part5]
//! [ov:caching:part6]
static const char codec_key[] = {0x30, 0x60, 0x70, 0x02, 0x04, 0x08, 0x3F, 0x6F, 0x72, 0x74, 0x78, 0x7F};
auto codec_xor = [&](const std::string& source_str) {
auto key_size = sizeof(codec_key);
@@ -119,7 +143,7 @@ auto compiled = core.compile_model(modelPath,
device,
ov::cache_encryption_callbacks(ov::EncryptionCallbacks{codec_xor, codec_xor}),
ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE)); // Step 5: Compile model
//! [ov:caching:part5]
//! [ov:caching:part6]
if (!compiled) {
throw std::runtime_error("error");
}
@@ -133,6 +157,7 @@ int main() {
part3();
part4();
part5();
part6();
} catch (...) {
}
return 0;
16 changes: 13 additions & 3 deletions docs/articles_en/assets/snippets/ov_caching.py
@@ -44,6 +44,16 @@
# ! [ov:caching:part3]

# ! [ov:caching:part4]
core = ov.Core()
if "GPU" in core.available_devices:
core.set_property({props.cache_dir: path_to_cache_dir})
config_cache = {}
config_cache["CACHE_MODE"] = "OPTIMIZE_SIZE"
# Note: model path needs to point to the *.xml file, not *.bin when using the IR model format.
compiled_model = core.compile_model(model=model_path, device_name='GPU', config=config_cache)
# ! [ov:caching:part4]

# ! [ov:caching:part5]
import base64

def encrypt_base64(src):
@@ -58,9 +68,9 @@ def decrypt_base64(src):
config_cache["CACHE_ENCRYPTION_CALLBACKS"] = [encrypt_base64, decrypt_base64]
model = core.read_model(model=model_path)
compiled_model = core.compile_model(model=model, device_name=device_name, config=config_cache)
# ! [ov:caching:part4]

# ! [ov:caching:part5]

# ! [ov:caching:part6]
import base64

def encrypt_base64(src):
@@ -76,4 +86,4 @@ def decrypt_base64(src):
config_cache["CACHE_ENCRYPTION_CALLBACKS"] = [encrypt_base64, decrypt_base64]
config_cache["CACHE_MODE"] = "OPTIMIZE_SIZE"
compiled_model = core.compile_model(model=model_path, device_name='GPU', config=config_cache)
# ! [ov:caching:part5]
# ! [ov:caching:part6]
@@ -140,6 +140,35 @@ model caching, use the following code in your application:
:language: cpp
:fragment: [ov:caching:part3]

Set ``CacheMode`` property to ``OPTIMIZE_SIZE`` to enable weightless caching
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Weightless caching creates a cache file that does not contain the model weights; instead, the weights are loaded from the original model file whenever the cached model is used. This reduces the size of the cache file.

.. tab-set::

.. tab-item:: Python
:sync: py

.. doxygensnippet:: docs/articles_en/assets/snippets/ov_caching.py
:language: py
:fragment: [ov:caching:part4]

.. tab-item:: C++
:sync: cpp

.. doxygensnippet:: docs/articles_en/assets/snippets/ov_caching.cpp
:language: cpp
:fragment: [ov:caching:part4]

.. important::

Currently, this property is supported only by the GPU Plugin and IR model format.

.. important::

Some weights that undergo transformations during model compilation may not be eligible for weightless caching. In such cases, the cache file contains these weights, while the weightless caching mechanism is still used for the rest. The feature supports some of the common transformations and replicates them after loading the model from the cache.
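
For orientation, here is a minimal, self-contained sketch of the flow shown in the
snippets above (the model path, device, and cache directory are placeholders; adapt
them to your setup):

.. code-block:: cpp

   #include <openvino/openvino.hpp>

   int main() {
       ov::Core core;
       // Hypothetical cache location; any writable directory works.
       core.set_property(ov::cache_dir("model_cache"));
       // OPTIMIZE_SIZE requests weightless caching: the cache entry keeps the
       // compiled topology but not the weights, which are re-read from the
       // original IR (*.xml + *.bin) whenever the cached model is loaded.
       auto compiled = core.compile_model("model.xml", "GPU",
                                          ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE));
       return 0;
   }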

Enable cache encryption
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

@@ -154,16 +183,16 @@ loading it from the cache. Currently, this property can be set only in ``compile

.. doxygensnippet:: docs/articles_en/assets/snippets/ov_caching.py
:language: py
:fragment: [ov:caching:part4]
:fragment: [ov:caching:part5]

.. tab-item:: C++
:sync: cpp

.. doxygensnippet:: docs/articles_en/assets/snippets/ov_caching.cpp
:language: cpp
:fragment: [ov:caching:part4]
:fragment: [ov:caching:part5]

Full encryption only works when the ``CacheMode`` property is set to ``OPTIMIZE_SIZE``.
If model caching is enabled in the GPU Plugin, the model topology can be encrypted while it is saved to the cache and decrypted when it is loaded from the cache. Full encryption only works when the ``CacheMode`` property is set to ``OPTIMIZE_SIZE``.
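
As a rough sketch, a symmetric XOR codec in the spirit of the snippet referenced
below can serve as both callbacks (illustration only, not real cryptography; the
key byte and paths are placeholders):

.. code-block:: cpp

   #include <openvino/openvino.hpp>
   #include <string>

   int main() {
       ov::Core core;
       core.set_property(ov::cache_dir("model_cache"));  // hypothetical cache dir
       // XOR with a fixed byte is its own inverse, so one lambda can serve as
       // both the encrypt and the decrypt callback.
       auto codec_xor = [](const std::string& source) {
           std::string result = source;
           for (auto& c : result)
               c ^= 0x5A;  // arbitrary illustrative key byte
           return result;
       };
       auto compiled = core.compile_model(
           "model.xml", "GPU",
           ov::cache_encryption_callbacks(ov::EncryptionCallbacks{codec_xor, codec_xor}),
           ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE));  // required for full encryption
       return 0;
   }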

.. tab-set::

Expand All @@ -172,14 +201,14 @@ Full encryption only works when the ``CacheMode`` property is set to ``OPTIMIZE_

.. doxygensnippet:: docs/articles_en/assets/snippets/ov_caching.py
:language: py
:fragment: [ov:caching:part5]
:fragment: [ov:caching:part6]

.. tab-item:: C++
:sync: cpp

.. doxygensnippet:: docs/articles_en/assets/snippets/ov_caching.cpp
:language: cpp
:fragment: [ov:caching:part5]
:fragment: [ov:caching:part6]

.. important::

4 changes: 2 additions & 2 deletions src/bindings/python/src/openvino/runtime/__init__.py
@@ -6,12 +6,12 @@
# noqa: F401

import warnings
warnings.simplefilter("always", DeprecationWarning)
warnings.filterwarnings("once", category=DeprecationWarning, module="openvino.runtime")
warnings.warn(
"The `openvino.runtime` module is deprecated and will be removed in the 2026.0 release. "
"Please replace `openvino.runtime` with `openvino`.",
DeprecationWarning,
stacklevel=2
stacklevel=1
)


29 changes: 24 additions & 5 deletions src/common/snippets/include/snippets/emitter.hpp
@@ -52,16 +52,21 @@ class Emitter {

/**
* @brief called by generator to generate code to produce target code for a specific operation
* @details
* Avoids passing default arguments to the virtual function while still allowing
* the caller to invoke emit_code without "pool" or "gpr"
* @param in vector of vector argument registers
* @param out vector of vector resulting registers
* @param pool optional vector of free vector registers which might be used inside method
* @param gpr vector of free generam puproce registers which might be used inside method
* @param gpr vector of free general purpose registers which might be used inside method
* @return void
*/
virtual void emit_code(const std::vector<size_t>& in,
const std::vector<size_t>& out,
const std::vector<size_t>& pool = {},
const std::vector<size_t>& gpr = {}) const = 0;
void emit_code(const std::vector<size_t>& in,
const std::vector<size_t>& out,
const std::vector<size_t>& pool = {},
const std::vector<size_t>& gpr = {}) const {
emit_code_impl(in, out, pool, gpr);
}

/**
* @brief called by generator to generate data section, if needed for a specific operation
@@ -70,6 +75,20 @@ class Emitter {
virtual void emit_data() const {}

virtual ~Emitter() = default;

private:
/**
* @brief called by generator to generate code to produce target code for a specific operation
* @param in vector of vector argument registers
* @param out vector of vector resulting registers
* @param pool optional vector of free vector registers which might be used inside method
* @param gpr vector of free general purpose registers which might be used inside method
* @return void
*/
virtual void emit_code_impl(const std::vector<size_t>& in,
const std::vector<size_t>& out,
const std::vector<size_t>& pool,
const std::vector<size_t>& gpr) const = 0;
};

} // namespace snippets
9 changes: 5 additions & 4 deletions src/common/snippets/tests/include/lowering_utils.hpp
@@ -19,11 +19,12 @@ using BlockedShapeVector = ov::snippets::op::Subgraph::BlockedShapeVector;
class DummyEmitter : public ov::snippets::Emitter {
public:
DummyEmitter(const std::vector<ov::Node::type_info_t>& custom_opset = {}) : ov::snippets::Emitter() {}
void emit_code(const std::vector<size_t>&,
const std::vector<size_t>&,
const std::vector<size_t>&,
const std::vector<size_t>&) const override {}
void emit_data() const override {}
protected:
void emit_code_impl(const std::vector<size_t>&,
const std::vector<size_t>&,
const std::vector<size_t>&,
const std::vector<size_t>&) const override {}
};

struct DummyCompiledSnippet : public ov::snippets::CompiledSnippet {
@@ -723,6 +723,7 @@ ov::pass::RoPEFusionQwen::RoPEFusionQwen(int split_output_id) {
auto rotary_emb_cos = makePattern("[1,?,1,?]"); // [1,..4096,1,128]
auto rotary_emb_sin = makePattern("[1,?,1,?]"); // [1,..4096,1,128]
auto qkv_proj = makePattern("[?,?,?]"); // [?,?,12288]
auto position_ids = makePattern();

auto head_cnt = ov::gen_pattern::Symbol("head_cnt");
auto head_size = ov::gen_pattern::Symbol("head_size");
@@ -749,14 +750,19 @@ ov::pass::RoPEFusionQwen::RoPEFusionQwen(int split_output_id) {
auto ScatterUpdate_463814 = makePattern<opset3::ScatterUpdate>({{0, 0}, {1}, Gather_377635 | neg_Multiply, {0}});
auto slice_Slice_446 =
makePattern<ov::opset8::Slice>({rotary_emb_cos, Gather_377635 | neg_Multiply, {INT_MAX}, {1}, {1}});

auto gather_cos_by_pos_ids = makePattern<opset8::Gather>({rotary_emb_cos, position_ids, 1}, {{"batch_dims", 0}});
auto reshape_cos_to_expected_layout =
makePattern<opset8::Reshape>({gather_cos_by_pos_ids, {-1, 1, 1, 128}}, {{"special_zero", false}});

auto slice_StridedSlice_446 = GenStridedSlice(rotary_emb_cos,
ScatterUpdate_463814,
{0, INT_MAX},
{1, 1},
1); // tensor_array<f32[1,..4096,1,128]>
auto mul_Multiply_552 =
makePattern<opset1::Multiply>({slice_Slice_543, slice_StridedSlice_446 | slice_Slice_446},
{{"auto_broadcast", "numpy"}}); // tensor_array<f32[?,?,32,128]>
auto mul_Multiply_552 = makePattern<opset1::Multiply>(
{slice_Slice_543, slice_StridedSlice_446 | slice_Slice_446 | reshape_cos_to_expected_layout},
{{"auto_broadcast", "numpy"}}); // tensor_array<f32[?,?,32,128]>

auto reshape_opt1 = [&](std::shared_ptr<Node> input_BLHS) {
auto ShapeOf_485814 = makePattern<opset1::ShapeOf>({input_BLHS}, {});
Expand Down Expand Up @@ -790,18 +796,28 @@ ov::pass::RoPEFusionQwen::RoPEFusionQwen(int split_output_id) {
makePattern<opset1::Squeeze>({Multiply_567527, -2}); // tensor_array<f32[?,?,32,64]>
auto ListUnpack_586_Squeeze =
makePattern<opset1::Squeeze>({ListUnpack_586_Split->output(0), -2}); // tensor_array<f32[?,?,32,64]>
auto cat_Concat_593 = makePattern<opset1::Concat>({ListUnpack_586_Squeeze_0, ListUnpack_586_Squeeze},
{{"axis", -1}}); // tensor_array<f32[?,?,32,128]>

auto ListUnpack_Squeeze_0_1 =
makePattern<opset1::Reshape>({Multiply_567527, {-1, 1, 32, 64}}, {{"special_zero", false}});
auto ListUnpack_Squeeze_1 =
makePattern<opset1::Reshape>({ListUnpack_586_Split->output(0), {-1, 1, 32, 64}}, {{"special_zero", false}});

auto cat_Concat_593 = makePattern<opset1::Concat>(
{ListUnpack_586_Squeeze_0 | ListUnpack_Squeeze_0_1, ListUnpack_586_Squeeze | ListUnpack_Squeeze_1},
{{"axis", -1}}); // tensor_array<f32[?,?,32,128]>
auto slice_StridedSlice_470 = GenStridedSlice(rotary_emb_sin,
ScatterUpdate_463814,
{0, INT_MAX},
{1, 1},
1); // tensor_array<f32[1,..4096,1,128]>
auto slice_Slice_470 =
makePattern<opset8::Slice>({rotary_emb_sin, Gather_377635 | neg_Multiply, {INT_MAX}, {1}, {1}});
auto mul_Multiply_594 =
makePattern<opset1::Multiply>({cat_Concat_593, slice_StridedSlice_470 | slice_Slice_470},
{{"auto_broadcast", "numpy"}}); // tensor_array<f32[?,?,32,128]>
auto gather_sin_by_pos_ids = makePattern<opset8::Gather>({rotary_emb_sin, position_ids, 1}, {{"batch_dims", 0}});
auto reshape_sin_to_expected_layout =
makePattern<opset8::Reshape>({gather_sin_by_pos_ids, {-1, 1, 1, 128}}, {{"special_zero", false}});
auto mul_Multiply_594 = makePattern<opset1::Multiply>(
{cat_Concat_593, slice_StridedSlice_470 | slice_Slice_470 | reshape_sin_to_expected_layout},
{{"auto_broadcast", "numpy"}}); // tensor_array<f32[?,?,32,128]>
auto add_Add_597 = makePattern<opset1::Add>({mul_Multiply_552, mul_Multiply_594},
{{"auto_broadcast", "numpy"}}); // tensor_array<f32[?,?,32,128]>

Expand Down Expand Up @@ -858,16 +874,25 @@ ov::pass::RoPEFusionQwen::RoPEFusionQwen(int split_output_id) {
new_args.push_back(pattern_map.at(rotary_emb_cos));
new_args.push_back(pattern_map.at(rotary_emb_sin));

ov::NodeVector rt_from = {pattern_map.at(Multiply_567527).get_node_shared_ptr(),
pattern_map.at(cat_Concat_593).get_node_shared_ptr(),
pattern_map.at(mul_Multiply_594).get_node_shared_ptr(),
pattern_map.at(add_Add_597).get_node_shared_ptr()};

if (pattern_map.count(position_ids)) {
new_args.push_back(pattern_map.at(position_ids));
config.gather_position_arg_id = 3;
rt_from.push_back(pattern_map.at(ListUnpack_Squeeze_0_1).get_node_shared_ptr());
rt_from.push_back(pattern_map.at(ListUnpack_Squeeze_1).get_node_shared_ptr());
} else {
rt_from.push_back(pattern_map.at(ListUnpack_586_Squeeze_0).get_node_shared_ptr());
rt_from.push_back(pattern_map.at(ListUnpack_586_Squeeze).get_node_shared_ptr());
}

auto old_node = root;
auto new_node = std::make_shared<op::internal::RoPE>(new_args, config);
new_node->set_friendly_name(old_node->get_friendly_name());
ov::copy_runtime_info({pattern_map.at(Multiply_567527).get_node_shared_ptr(),
pattern_map.at(ListUnpack_586_Squeeze_0).get_node_shared_ptr(),
pattern_map.at(ListUnpack_586_Squeeze).get_node_shared_ptr(),
pattern_map.at(cat_Concat_593).get_node_shared_ptr(),
pattern_map.at(mul_Multiply_594).get_node_shared_ptr(),
pattern_map.at(add_Add_597).get_node_shared_ptr()},
new_node);
ov::copy_runtime_info(rt_from, new_node);
ov::replace_node(old_node, new_node);
return true;
};