Update AIPC_Inference.md to reflect changes in intel-npu-acceleration-library #246

Merged · 5 commits · Jan 16, 2025
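This PR updates the Phi-3 AIPC samples for the newer intel-npu-acceleration-library API, which moves the quantization options out of the `from_pretrained()` keyword arguments and into a `CompilerConfig` object. The sketch below summarizes the migration; it is assembled from the diffs that follow and is illustrative only (it assumes the library is installed and an Intel NPU is present):

```python
# Before/after sketch of the API change this PR documents (illustrative only).
from transformers import AutoTokenizer

# Old style, removed in this PR:
#   import intel_npu_acceleration_library as npu_lib
#   model = npu_lib.NPUModelForCausalLM.from_pretrained(
#       model_id, torch_dtype="auto", dtype=npu_lib.int4, trust_remote_code=True)

# New style, used by the updated notebook and markdown guide:
from intel_npu_acceleration_library import NPUModelForCausalLM, int4
from intel_npu_acceleration_library.compiler import CompilerConfig

model_id = "microsoft/Phi-3-mini-4k-instruct"
compiler_conf = CompilerConfig(dtype=int4)  # quantization choice now lives in CompilerConfig
model = NPUModelForCausalLM.from_pretrained(
    model_id, use_cache=True, config=compiler_conf, attn_implementation="sdpa"
).eval()
tokenizer = AutoTokenizer.from_pretrained(model_id)
```

The same replacement is applied in both code/03.Inference/AIPC/AIPC_NPU_DEMO.ipynb and md/03.Inference/AIPC_Inference.md.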
Changes from all commits
code/03.Inference/AIPC/AIPC_NPU_DEMO.ipynb (13 changes: 6 additions & 7 deletions)
@@ -15,7 +15,8 @@
"outputs": [],
"source": [
"from transformers import AutoTokenizer, pipeline,TextStreamer\n",
"import intel_npu_acceleration_library as npu_lib\n",
"from intel_npu_acceleration_library import NPUModelForCausalLM, int4\n",
"from intel_npu_acceleration_library.compiler import CompilerConfig\n",
"\n",
"\n",
"import warnings"
@@ -84,12 +85,10 @@
}
],
"source": [
"model = npu_lib.NPUModelForCausalLM.from_pretrained(\n",
" model_id,\n",
" torch_dtype=\"auto\",\n",
" dtype=npu_lib.int4,\n",
" trust_remote_code=True\n",
" )\n",
"compiler_conf = CompilerConfig(dtype=int4)\n",
"model = NPUModelForCausalLM.from_pretrained(\n",
" model_id, use_cache=True, config=compiler_conf, attn_implementation='sdpa'\n",
").eval()\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
"\n",
md/03.Inference/AIPC_Inference.md (38 changes: 16 additions & 22 deletions)
@@ -46,51 +46,45 @@ Install the Python Library with pip
Using Intel NPU acceleration, this library does not change the traditional coding process. You only need to use it to quantize the original Phi-3 model, for example to FP16, INT8, or INT4:

```python

from transformers import AutoTokenizer, pipeline,TextStreamer
- import intel_npu_acceleration_library as npu_lib
+ from intel_npu_acceleration_library import NPUModelForCausalLM, int4
+ from intel_npu_acceleration_library.compiler import CompilerConfig
import warnings

model_id = "microsoft/Phi-3-mini-4k-instruct"

- model = npu_lib.NPUModelForCausalLM.from_pretrained(
-     model_id,
-     torch_dtype="auto",
-     dtype=npu_lib.int4,
-     trust_remote_code=True
- )
+ compiler_conf = CompilerConfig(dtype=int4)
+ model = NPUModelForCausalLM.from_pretrained(
+     model_id, use_cache=True, config=compiler_conf, attn_implementation="sdpa"
+ ).eval()

tokenizer = AutoTokenizer.from_pretrained(model_id)

text_streamer = TextStreamer(tokenizer, skip_prompt=True)

```

After the quantization is successful, continue execution to call the NPU to run the Phi-3 model.

```python

generation_args = {
"max_new_tokens": 1024,
"return_full_text": False,
"temperature": 0.3,
"do_sample": False,
"streamer": text_streamer,
}
"max_new_tokens": 1024,
"return_full_text": False,
"temperature": 0.3,
"do_sample": False,
"streamer": text_streamer,
}

pipe = pipeline(
"text-generation",
model=model,
tokenizer=tokenizer,
"text-generation",
model=model,
tokenizer=tokenizer,
)

query = "<|system|>You are a helpful AI assistant.<|end|><|user|>Can you introduce yourself?<|end|><|assistant|>"

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    pipe(query, **generation_args)
```

When executing the code, we can view the running status of the NPU through Task Manager.