From 85fa72e510d1d3de6d44e3a7d791a08b89ec7e48 Mon Sep 17 00:00:00 2001
From: Jaylyn Barbee <51131738+Jaylyn-Barbee@users.noreply.github.com>
Date: Mon, 13 Jan 2025 13:13:13 -0500
Subject: [PATCH 1/5] Update AIPC_Inference.md

---
 md/03.Inference/AIPC_Inference.md | 38 +++++++++++++------------------
 1 file changed, 16 insertions(+), 22 deletions(-)

diff --git a/md/03.Inference/AIPC_Inference.md b/md/03.Inference/AIPC_Inference.md
index 2427fde6..34f298d8 100644
--- a/md/03.Inference/AIPC_Inference.md
+++ b/md/03.Inference/AIPC_Inference.md
@@ -46,42 +46,38 @@ Install the Python Library with pip
 Using Intel NPU acceleration, this library does not affect the traditional encoding process. You only need to use this library to quantize the original Phi-3 model, such as FP16,INT8,INT4,such as
 
 ```python
-
 from transformers import AutoTokenizer, pipeline,TextStreamer
-import intel_npu_acceleration_library as npu_lib
+from intel_npu_acceleration_library import NPUModelForCausalLM, int4
+from intel_npu_acceleration_library.compiler import CompilerConfig
 
 import warnings
 
 model_id = "microsoft/Phi-3-mini-4k-instruct"
 
-model = npu_lib.NPUModelForCausalLM.from_pretrained(
-                                    model_id,
-                                    torch_dtype="auto",
-                                    dtype=npu_lib.int4,
-                                    trust_remote_code=True
-                                )
+compiler_conf = CompilerConfig(dtype=int4)
+model = NPUModelForCausalLM.from_pretrained(
+    model_id, use_cache=True, config=compiler_conf, attn_implementation="sdpa"
+).eval()
 
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 
 text_streamer = TextStreamer(tokenizer, skip_prompt=True)
-
 ```
-After the quantification is successful, continue execution to call the NPU to run the Phi-3 model. 
+After the quantification is successful, continue execution to call the NPU to run the Phi-3 model.
 
 ```python
-
 generation_args = {
-            "max_new_tokens": 1024,
-            "return_full_text": False,
-            "temperature": 0.3,
-            "do_sample": False,
-            "streamer": text_streamer,
-        }
+    "max_new_tokens": 1024,
+    "return_full_text": False,
+    "temperature": 0.3,
+    "do_sample": False,
+    "streamer": text_streamer,
+}
 
 pipe = pipeline(
-    "text-generation", 
-    model=model, 
-    tokenizer=tokenizer, 
+    "text-generation",
+    model=model,
+    tokenizer=tokenizer,
 )
 
 query = "<|system|>You are a helpful AI assistant.<|end|><|user|>Can you introduce yourself?<|end|><|assistant|>"
@@ -89,8 +85,6 @@ query = "<|system|>You are a helpful AI assistant.<|end|><|user|>Can you introduce yourself?<|end|><|assistant|>"
 with warnings.catch_warnings():
     warnings.simplefilter("ignore")
     pipe(query, **generation_args)
-
-
 ```
 
 When executing code, we can view the running status of the NPU through Task Manager

From b0efcee96d2b135b4f580d2f499e75a46964788f Mon Sep 17 00:00:00 2001
From: Jaylyn Barbee <51131738+Jaylyn-Barbee@users.noreply.github.com>
Date: Mon, 13 Jan 2025 13:20:10 -0500
Subject: [PATCH 2/5] Update AIPC_NPU_DEMO.ipynb

---
 code/03.Inference/AIPC/AIPC_NPU_DEMO.ipynb | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/code/03.Inference/AIPC/AIPC_NPU_DEMO.ipynb b/code/03.Inference/AIPC/AIPC_NPU_DEMO.ipynb
index b7571224..6a428277 100644
--- a/code/03.Inference/AIPC/AIPC_NPU_DEMO.ipynb
+++ b/code/03.Inference/AIPC/AIPC_NPU_DEMO.ipynb
@@ -15,7 +15,8 @@
    "outputs": [],
    "source": [
     "from transformers import AutoTokenizer, pipeline,TextStreamer\n",
-    "import intel_npu_acceleration_library as npu_lib\n",
+    "from intel_npu_acceleration_library import NPUModelForCausalLM, int4\n",
+    "from intel_npu_acceleration_library.compiler import CompilerConfig\n",
     "\n",
     "\n",
     "import warnings"
@@ -84,12 +85,10 @@
     }
    ],
    "source": [
-    "model = npu_lib.NPUModelForCausalLM.from_pretrained(\n",
-    "                                    model_id,\n",
-    "                                    torch_dtype=\"auto\",\n",
-    "                                    dtype=npu_lib.int4,\n",
-    "                                    trust_remote_code=True\n",
-    "                                )\n",
+    "compiler_conf = CompilerConfig(dtype=int4)\n"
+    "model = NPUModelForCausalLM.from_pretrained(\n",
+    "model_id, use_cache=True, config=compiler_conf, attn_implementation="sdpa"\n"
+    ").eval()\n"
     "\n",
     "tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
     "\n",

From 09e86c742d5f2c072cac5930490b939a9b7d6a34 Mon Sep 17 00:00:00 2001
From: Jaylyn Barbee <51131738+Jaylyn-Barbee@users.noreply.github.com>
Date: Mon, 13 Jan 2025 13:20:41 -0500
Subject: [PATCH 3/5] Update AIPC_NPU_DEMO.ipynb

---
 code/03.Inference/AIPC/AIPC_NPU_DEMO.ipynb | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/code/03.Inference/AIPC/AIPC_NPU_DEMO.ipynb b/code/03.Inference/AIPC/AIPC_NPU_DEMO.ipynb
index 6a428277..7d8e541d 100644
--- a/code/03.Inference/AIPC/AIPC_NPU_DEMO.ipynb
+++ b/code/03.Inference/AIPC/AIPC_NPU_DEMO.ipynb
@@ -85,10 +85,10 @@
     }
    ],
    "source": [
-    "compiler_conf = CompilerConfig(dtype=int4)\n"
+    "compiler_conf = CompilerConfig(dtype=int4)\n",
     "model = NPUModelForCausalLM.from_pretrained(\n",
-    "model_id, use_cache=True, config=compiler_conf, attn_implementation="sdpa"\n"
-    ").eval()\n"
+    "model_id, use_cache=True, config=compiler_conf, attn_implementation="sdpa"\n",
+    ").eval()\n",
     "\n",
     "tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
     "\n",

From f5c8e3831f7fd101bf8b03b92541a5b9536072f8 Mon Sep 17 00:00:00 2001
From: Jaylyn Barbee <51131738+Jaylyn-Barbee@users.noreply.github.com>
Date: Mon, 13 Jan 2025 13:21:58 -0500
Subject: [PATCH 4/5] Update AIPC_NPU_DEMO.ipynb

---
 code/03.Inference/AIPC/AIPC_NPU_DEMO.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/code/03.Inference/AIPC/AIPC_NPU_DEMO.ipynb b/code/03.Inference/AIPC/AIPC_NPU_DEMO.ipynb
index 7d8e541d..6099fecc 100644
--- a/code/03.Inference/AIPC/AIPC_NPU_DEMO.ipynb
+++ b/code/03.Inference/AIPC/AIPC_NPU_DEMO.ipynb
@@ -87,7 +87,7 @@
    "source": [
     "compiler_conf = CompilerConfig(dtype=int4)\n",
     "model = NPUModelForCausalLM.from_pretrained(\n",
-    "model_id, use_cache=True, config=compiler_conf, attn_implementation="sdpa"\n",
+    "model_id, use_cache=True, config=compiler_conf, attn_implementation='sdpa'\n",
     ").eval()\n",
     "\n",
     "tokenizer = AutoTokenizer.from_pretrained(model_id)\n",

From 277453d275ceef8828429a25f9e0e718699883a5 Mon Sep 17 00:00:00 2001
From: Jaylyn Barbee <51131738+Jaylyn-Barbee@users.noreply.github.com>
Date: Mon, 13 Jan 2025 13:22:22 -0500
Subject: [PATCH 5/5] Update AIPC_NPU_DEMO.ipynb

---
 code/03.Inference/AIPC/AIPC_NPU_DEMO.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/code/03.Inference/AIPC/AIPC_NPU_DEMO.ipynb b/code/03.Inference/AIPC/AIPC_NPU_DEMO.ipynb
index 6099fecc..ba4413fb 100644
--- a/code/03.Inference/AIPC/AIPC_NPU_DEMO.ipynb
+++ b/code/03.Inference/AIPC/AIPC_NPU_DEMO.ipynb
@@ -87,7 +87,7 @@
    "source": [
     "compiler_conf = CompilerConfig(dtype=int4)\n",
     "model = NPUModelForCausalLM.from_pretrained(\n",
-    "model_id, use_cache=True, config=compiler_conf, attn_implementation='sdpa'\n",
+    "    model_id, use_cache=True, config=compiler_conf, attn_implementation='sdpa'\n",
     ").eval()\n",
     "\n",
     "tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
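
Patches 2 through 5 all touch the same notebook cell: patch 2 swaps in the new CompilerConfig API, and patches 3, 4, and 5 repair the JSON string commas, the quoting around 'sdpa', and the indentation that patch 2 got wrong. For reference, this is the effective Python of the notebook's import cell and model-loading cell once the whole series applies (a sketch assembled from the hunks above; the `model_id` assignment is copied from the AIPC_Inference.md snippet in patch 1, since the notebook cell that defines it sits outside these hunks):

```python
from transformers import AutoTokenizer, pipeline, TextStreamer
from intel_npu_acceleration_library import NPUModelForCausalLM, int4
from intel_npu_acceleration_library.compiler import CompilerConfig

import warnings

# Defined in an earlier notebook cell; value taken from the markdown sample.
model_id = "microsoft/Phi-3-mini-4k-instruct"

# Compile the model for the Intel NPU, quantized to int4 via CompilerConfig,
# and put it in inference mode.
compiler_conf = CompilerConfig(dtype=int4)
model = NPUModelForCausalLM.from_pretrained(
    model_id, use_cache=True, config=compiler_conf, attn_implementation='sdpa'
).eval()

tokenizer = AutoTokenizer.from_pretrained(model_id)
```

This matches the updated AIPC_Inference.md snippet from patch 1, apart from the quote style around 'sdpa'.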