
[LLM Runtime] Support load_in_nbit in llm runtime (#688)
* support load_in_nbit in llm runtime

Signed-off-by: zhenwei-intel <zhenwei.liu@intel.com>
zhenwei-intel authored Nov 15, 2023
1 parent cd40423 commit 4423f70
Showing 3 changed files with 22 additions and 15 deletions.
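
In short, `AutoModelForCausalLM.from_pretrained` can now be called with `load_in_4bit=True` or `load_in_8bit=True` and, when no explicit `quantization_config` is given, it falls back to LLM Runtime-friendly weight-only defaults (int4 weights with int8 compute, or int8 weights with bf16 compute). A minimal sketch of the new calling convention, assuming the package build from this commit and the model id used in the README below:

```python
from intel_extension_for_transformers.transformers import AutoModelForCausalLM

model_name = "Intel/neural-chat-7b-v1-1"  # Hugging Face model_id or local model

# When the LLM Runtime path is in effect, these pick the weight-only defaults
# added by this commit: int4 weights / int8 compute, and int8 weights / bf16 compute.
model_int4 = AutoModelForCausalLM.from_pretrained(model_name, load_in_4bit=True)
model_int8 = AutoModelForCausalLM.from_pretrained(model_name, load_in_8bit=True)
```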
10 changes: 4 additions & 6 deletions README.md
````diff
@@ -61,32 +61,30 @@ Below is the sample code to enable weight-only INT4/INT8 inference. See more [ex
 ### INT4 Inference
 ```python
 from transformers import AutoTokenizer, TextStreamer
-from intel_extension_for_transformers.transformers import AutoModelForCausalLM, WeightOnlyQuantConfig
+from intel_extension_for_transformers.transformers import AutoModelForCausalLM
 model_name = "Intel/neural-chat-7b-v1-1" # Hugging Face model_id or local model
-config = WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4")
 prompt = "Once upon a time, there existed a little girl,"
 
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 inputs = tokenizer(prompt, return_tensors="pt").input_ids
 streamer = TextStreamer(tokenizer)
 
-model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=config)
+model = AutoModelForCausalLM.from_pretrained(model_name, load_in_4bit=True)
 outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
 ```
 
 ### INT8 Inference
 ```python
 from transformers import AutoTokenizer, TextStreamer
-from intel_extension_for_transformers.transformers import AutoModelForCausalLM, WeightOnlyQuantConfig
+from intel_extension_for_transformers.transformers import AutoModelForCausalLM
 model_name = "Intel/neural-chat-7b-v1-1" # Hugging Face model_id or local model
-config = WeightOnlyQuantConfig(compute_dtype="bf16", weight_dtype="int8")
 prompt = "Once upon a time, there existed a little girl,"
 
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 inputs = tokenizer(prompt, return_tensors="pt").input_ids
 streamer = TextStreamer(tokenizer)
 
-model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=config)
+model = AutoModelForCausalLM.from_pretrained(model_name, load_in_8bit=True)
 outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
 ```
````
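
The removed `WeightOnlyQuantConfig` lines are exactly what `load_in_4bit=True` / `load_in_8bit=True` now reproduce by default on the LLM Runtime path, so the shorthand and the explicit config should be interchangeable here; the explicit form remains available for non-default dtype combinations. A sketch of both spellings for the INT4 case, assuming the build from this commit:

```python
from intel_extension_for_transformers.transformers import (
    AutoModelForCausalLM,
    WeightOnlyQuantConfig,
)

model_name = "Intel/neural-chat-7b-v1-1"  # Hugging Face model_id or local model

# Shorthand introduced by this commit (LLM Runtime default: int4 weights, int8 compute).
model = AutoModelForCausalLM.from_pretrained(model_name, load_in_4bit=True)

# Explicit config, equivalent to the default above and still useful for other dtypes.
config = WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4")
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=config)
```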

5 changes: 2 additions & 3 deletions intel_extension_for_transformers/llm/runtime/graph/README.md
````diff
@@ -64,16 +64,15 @@ pip install intel-extension-for-transformers
 You can use Python API to run Hugging Face model simply. Here is the sample code:
 ```python
 from transformers import AutoTokenizer, TextStreamer
-from intel_extension_for_transformers.transformers import AutoModelForCausalLM, WeightOnlyQuantConfig
+from intel_extension_for_transformers.transformers import AutoModelForCausalLM
 model_name = "Intel/neural-chat-7b-v1-1" # Hugging Face model_id or local model
-config = WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4")
 prompt = "Once upon a time, there existed a little girl,"
 
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 inputs = tokenizer(prompt, return_tensors="pt").input_ids
 streamer = TextStreamer(tokenizer)
 
-model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=config)
+model = AutoModelForCausalLM.from_pretrained(model_name, load_in_4bit=True)
 outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
 ```
````
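
The runtime README keeps only the INT4 example; per the `from_pretrained` change in this commit, `load_in_8bit=True` works the same way on this path and defaults to int8 weights with bf16 compute. A short sketch under the same assumptions as above:

```python
from transformers import AutoTokenizer, TextStreamer
from intel_extension_for_transformers.transformers import AutoModelForCausalLM

model_name = "Intel/neural-chat-7b-v1-1"  # Hugging Face model_id or local model
prompt = "Once upon a time, there existed a little girl,"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
inputs = tokenizer(prompt, return_tensors="pt").input_ids
streamer = TextStreamer(tokenizer)

# Defaults to WeightOnlyQuantConfig(compute_dtype="bf16", weight_dtype="int8") on this path.
model = AutoModelForCausalLM.from_pretrained(model_name, load_in_8bit=True)
outputs = model.generate(inputs, streamer=streamer, max_new_tokens=300)
```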

````diff
@@ -101,9 +101,14 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
         torch_dtype = kwargs.pop("torch_dtype", torch.float32)
         if load_in_4bit:
             if quantization_config is None:
-                quantization_config = WeightOnlyQuantConfig(
-                    compute_dtype=torch_dtype, weight_dtype="nf4"
-                )
+                if use_llm_runtime:
+                    quantization_config = WeightOnlyQuantConfig(
+                        compute_dtype="int8", weight_dtype="int4"
+                    )
+                else:
+                    quantization_config = WeightOnlyQuantConfig(
+                        compute_dtype=torch_dtype, weight_dtype="nf4"
+                    )
             else:
                 assert (
                     "4" in quantization_config.weight_dtype
@@ -112,9 +117,14 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
                 f"'fp4_e2m1' or 'fp4_e2m1_bnb' and compute_dtype should be {torch_dtype}."
         elif load_in_8bit:
             if quantization_config is None:
-                quantization_config = WeightOnlyQuantConfig(
-                    compute_dtype=torch_dtype, weight_dtype="int8"
-                )
+                if use_llm_runtime:
+                    quantization_config = WeightOnlyQuantConfig(
+                        compute_dtype="bf16", weight_dtype="int8"
+                    )
+                else:
+                    quantization_config = WeightOnlyQuantConfig(
+                        compute_dtype=torch_dtype, weight_dtype="int8"
+                    )
             else:
                 assert (
                     quantization_config.weight_dtype == "int8"
````
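
For reference, the branching added in `from_pretrained` can be read as a small defaulting rule. A minimal standalone sketch follows; the helper name `default_weight_only_config` is illustrative, not a library API, and it only mirrors the branches shown in the diff above:

```python
from intel_extension_for_transformers.transformers import WeightOnlyQuantConfig


def default_weight_only_config(load_in_4bit, load_in_8bit, use_llm_runtime, torch_dtype):
    """Illustrative helper mirroring the defaults added by this commit; not a library API."""
    if load_in_4bit:
        if use_llm_runtime:
            # LLM Runtime path: int4 weights, int8 compute.
            return WeightOnlyQuantConfig(compute_dtype="int8", weight_dtype="int4")
        # PyTorch path keeps the previous nf4 default.
        return WeightOnlyQuantConfig(compute_dtype=torch_dtype, weight_dtype="nf4")
    if load_in_8bit:
        if use_llm_runtime:
            # LLM Runtime path: int8 weights, bf16 compute.
            return WeightOnlyQuantConfig(compute_dtype="bf16", weight_dtype="int8")
        return WeightOnlyQuantConfig(compute_dtype=torch_dtype, weight_dtype="int8")
    # Neither flag set: leave any user-provided quantization_config untouched.
    return None
```

When a `quantization_config` is passed explicitly together with `load_in_4bit` or `load_in_8bit`, the assertions kept in the diff still require a matching weight dtype (a 4-bit dtype such as "nf4"/"int4_*"/"fp4_*", or exactly "int8").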
