vali.py
import os
import time

import torch

# Point the Hugging Face cache at D:\cache. This must happen before
# transformers (or anything that imports it) is loaded, or the setting
# is ignored.
os.environ['HF_HOME'] = r'D:\cache'

from transformers import AutoModelForCausalLM, AutoTokenizer

from replace_hf import replace_linear_in_hf

# Fine-tuned checkpoint to validate.
model_path = "results/checkpoint-37500"
def quick_test(model, tokenizer, prompt: str):
    # Encode the prompt
    inputs = tokenizer.encode(prompt, return_tensors="pt")
    # Generate a short continuation (max_length includes the prompt tokens)
    outputs = model.generate(inputs, max_length=18)
    # Decode and print the result
    print(tokenizer.decode(outputs[0]))
torch.set_default_device("cuda")

# Load the base tokenizer and the fine-tuned weights from the local checkpoint.
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
model.config.use_cache = False

# Replace the model's Linear layers with BitLinear; print the module tree
# before and after to confirm the swap.
print(model)
replace_linear_in_hf(model, keep_param=True, custom_kernel=True)
print(model)
# Time ten short generations back to back.
start_time = time.time()
for _ in range(10):
    quick_test(model, tokenizer, prompt="Tom is the")
print(time.time() - start_time)
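
# For reference: replace_linear_in_hf comes from this repo's replace_hf
# module, whose source is not shown here. The sketch below is NOT that
# implementation, only a minimal illustration of the usual pattern it
# suggests: walk the module tree and swap every nn.Linear for a quantized
# drop-in. "BitLinearSketch" and "replace_linear_sketch" are hypothetical
# names introduced for illustration; nothing below is called by this script.
import torch.nn as nn

class BitLinearSketch(nn.Linear):
    """Hypothetical BitLinear stand-in: sign-quantizes weights on the fly."""

    def forward(self, x):
        # Binarize weights to {-1, +1}, rescaled by their mean magnitude,
        # in the style of BitNet linear layers; the bias is left untouched.
        scale = self.weight.abs().mean()
        w_bin = torch.sign(self.weight) * scale
        return nn.functional.linear(x, w_bin, self.bias)

def replace_linear_sketch(module: nn.Module) -> None:
    # Recursively swap nn.Linear children for the quantized variant,
    # reusing the existing parameters so the model stays usable
    # (roughly what keep_param=True hints at above).
    for name, child in module.named_children():
        if isinstance(child, nn.Linear):
            new = BitLinearSketch(child.in_features, child.out_features,
                                  bias=child.bias is not None)
            new.weight = child.weight
            if child.bias is not None:
                new.bias = child.bias
            setattr(module, name, new)
        else:
            replace_linear_sketch(child)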