
Commit

[Qwen2] create random lora model in test
chuxiaoyi2023 committed Nov 5, 2024
1 parent c2e53b9 commit 9f5eae8
Showing 10 changed files with 235 additions and 157 deletions.
23 changes: 19 additions & 4 deletions models/Qwen2/lora_demo/CMakeLists.txt
@@ -17,11 +17,26 @@ elseif (${TARGET_ARCH} STREQUAL "pcie")
message("PCIE mode, starting......")
endif()

add_definitions(-DDEBUG --std=c++17 -fPIC -Wall -Werror)
if (CMAKE_TYPE STREQUAL "DUMP")
add_definitions(-DDUMP_TENSOR)
add_definitions(-DDEBUG --std=c++17 -fPIC -Wall -lcnpy)
include_directories(${PROJECT_SOURCE_DIR}/third_party/cnpy)
add_subdirectory(third_party/cnpy)
else()
add_definitions(-DDEBUG --std=c++17 -fPIC -Wall)
endif()

set(CMAKE_BUILD_TYPE "Debug")

find_package(pybind11 REQUIRED CONFIG)

pybind11_add_module(chat chat.cpp)
target_link_libraries(chat PUBLIC bmrt bmlib)
install(TARGETS chat DESTINATION python)
file(GLOB CPP_FILES ${PROJECT_SOURCE_DIR}/*.cpp)


pybind11_add_module(chat ${CPP_FILES})
if (CMAKE_TYPE STREQUAL "DUMP")
target_link_libraries(chat PUBLIC bmrt bmlib cnpy)
else()
target_link_libraries(chat PUBLIC bmrt bmlib)
endif()
install(TARGETS chat DESTINATION python)
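
Note: the DUMP branch above defines DUMP_TENSOR and links cnpy only when CMAKE_TYPE=DUMP is set at configure time; otherwise the chat module links just bmrt and bmlib. A minimal sketch of driving both configurations from Python, assuming a plain out-of-tree cmake build and that TARGET_ARCH and CMAKE_TYPE are the only cache variables that need to be passed (the demo's own build scripts may set more):

# Sketch: configure and build the pybind11 "chat" module with and without DUMP mode.
import subprocess

def build(build_dir: str, target_arch: str = "pcie", dump: bool = False) -> None:
    cmake_args = ["cmake", "-S", ".", "-B", build_dir, f"-DTARGET_ARCH={target_arch}"]
    if dump:
        cmake_args.append("-DCMAKE_TYPE=DUMP")   # defines DUMP_TENSOR and links cnpy
    subprocess.run(cmake_args, check=True)
    subprocess.run(["cmake", "--build", build_dir, "-j"], check=True)

# build("build")                   # regular build
# build("build_dump", dump=True)   # build that dumps tensors to .npz via cnpy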
35 changes: 35 additions & 0 deletions models/Qwen2/lora_demo/adapter_config.json
@@ -0,0 +1,35 @@
{
"alpha_pattern": {},
"auto_mapping": null,
"base_model_name_or_path": "/data_public/Qwen2-7B",
"bias": "none",
"fan_in_fan_out": false,
"inference_mode": true,
"init_lora_weights": true,
"layer_replication": null,
"layers_pattern": null,
"layers_to_transform": null,
"loftq_config": {},
"lora_alpha": 16,
"lora_dropout": 0,
"megatron_config": null,
"megatron_core": "megatron.core",
"modules_to_save": null,
"peft_type": "LORA",
"r": 8,
"rank_pattern": {},
"revision": null,
"target_modules": [
"embed_tokens",
"up_proj",
"gate_proj",
"down_proj",
"k_proj",
"o_proj",
"v_proj",
"q_proj"
],
"task_type": "CAUSAL_LM",
"use_dora": false,
"use_rslora": false
}
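
For reference, this adapter_config.json is consumed by expanding it directly into peft's LoraConfig (as get_lora_model in test_lora.py below does). A minimal sketch, assuming a peft version that accepts every key in the file; older versions may need unknown keys filtered out first:

# Sketch: build a LoraConfig from adapter_config.json, mirroring get_lora_model below.
import json
from peft import LoraConfig

with open("models/Qwen2/lora_demo/adapter_config.json") as f:
    lora_config_dict = json.load(f)

lora_config = LoraConfig(**lora_config_dict)
print(lora_config.r, lora_config.lora_alpha, sorted(lora_config.target_modules))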
3 changes: 3 additions & 0 deletions models/Qwen2/lora_demo/chat.cpp
@@ -828,6 +828,9 @@ int Qwen::forward_first(std::vector<int> &tokens) {
total_length * kv_bytes);
}

// test lora
dump_net_output_to_file(bm_handle, net_blocks[NUM_LAYERS-1], "test_lora/bmodel_hidden_states.npz");

// forward lmhead
auto lm_out_mem = lm_launch(net_lm, out_mem,
(total_length - 1) * hidden_bytes, hidden_bytes);
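
With the DUMP build, the call above writes the last block's outputs to test_lora/bmodel_hidden_states.npz. A minimal sketch of inspecting that file and comparing it against the hidden states that test_lora.py's generate() saves; the npz key, the reference file path, and the assumption that both tensors cover the same token positions are hypothetical, since they depend on dump_net_output_to_file and on the prompt used:

# Sketch: compare hidden states dumped by chat.cpp against the torch reference.
import numpy as np

bmodel = np.load("test_lora/bmodel_hidden_states.npz")
print(bmodel.files)                                   # see which tensors were dumped
bm = bmodel[bmodel.files[0]].astype(np.float32).flatten()

# hypothetical path: written by generate() as {dir_path}/{prefix}_torch_hidden_states.npy
ref = np.load("test_lora/scale0_embedding_scale0_torch_hidden_states.npy").astype(np.float32).flatten()

n = min(bm.size, ref.size)                            # assume a shared prefix of values
cos = np.dot(bm[:n], ref[:n]) / (np.linalg.norm(bm[:n]) * np.linalg.norm(ref[:n]) + 1e-8)
print("cosine similarity:", cos)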
17 changes: 9 additions & 8 deletions models/Qwen2/lora_demo/export_onnx.py
@@ -312,7 +312,7 @@ def load_lora_model(origin_model, path):
return lora_model, lora_config


def convert_lora_to_bit(lora_model, lora_config, lora_scale, lora_offset, args):
def convert_lora_to_bit(lora_model, lora_config, args):
# extract layer from model
lora_weight_list = []
for i in range(len(lora_model.base_model.model.model.layers)):
@@ -347,7 +347,7 @@ def convert_lora_to_bit(lora_model, lora_config, lora_scale, lora_offset, args):
lora_weight_list.append(a)

# Flatten the weights and convert to uint32
lora_weights_fp32 = np.concatenate([(w.flatten() + lora_offset) * lora_scale for w in lora_weight_list])
lora_weights_fp32 = np.concatenate([w.flatten() for w in lora_weight_list])
lora_weights_fp32 = lora_weights_fp32
lora_weights_uint32 = lora_weights_fp32.view(np.uint32)
lora_weights_uint16 = (lora_weights_uint32 >> 16).astype(np.uint16) # Convert to bfloat16
@@ -379,7 +379,7 @@ def convert_lora_embedding():
)


def convert_lora_embedding_to_bit(lora_model, lora_config, lora_embedding_scale, lora_offset, args):
def convert_lora_embedding_to_bit(lora_model, lora_config, args):
# extract layer from model
lora_weight_list = []
lora_layers = lora_model.base_model.model.model.embed_tokens
@@ -411,7 +411,7 @@ def convert_lora_embedding_to_bit(lora_model, lora_config, lora_embedding_scale,
lora_weight_list.append(b)

# Flatten the weights and convert to uint32
lora_weights_fp32 = np.concatenate([(w.flatten() + lora_offset) * lora_embedding_scale for w in lora_weight_list])
lora_weights_fp32 = np.concatenate([w.flatten() for w in lora_weight_list])
lora_weights_uint32 = lora_weights_fp32.view(np.uint32)
lora_weights_uint16 = (lora_weights_uint32 >> 16).astype(np.uint16) # Convert to bfloat16

@@ -425,7 +425,8 @@ def convert_lora_embedding_to_bit(lora_model, lora_config, lora_embedding_scale,

return lora_weights_uint8

def convert_total_lora_to_bit(encrypt_path, origin_model, lora_scale, lora_embedding_scale, lora_offset, args):

def convert_total_lora_to_bit(encrypt_path, origin_model, args):
if args.max_rank_num == 0:
raise ValueError(f"max_rank_num is equal to {args.max_rank_num}")
if args.max_embedding_rank_num == 0:
@@ -439,10 +440,10 @@ def convert_total_lora_to_bit(encrypt_path, origin_model, lora_scale, lora_embed
zero_prefix = np.zeros(64, dtype=np.uint8)
# lora embedding
lora_model, lora_config = load_lora_model(origin_model, args.lora_embedding_path)
lora_embedding_weights = convert_lora_embedding_to_bit(lora_model, lora_config, lora_embedding_scale, lora_offset, args)
lora_embedding_weights = convert_lora_embedding_to_bit(lora_model, lora_config, args)
# lora
lora_model, lora_config = load_lora_model(origin_model, args.lora_path)
lora_weights = convert_lora_to_bit(lora_model, lora_config, lora_scale, lora_offset, args)
lora_weights = convert_lora_to_bit(lora_model, lora_config, args)
total_lora_weights = np.concatenate([zero_prefix, lora_weights, lora_embedding_weights]) # in the bmodel, lora_embedding is placed last, so the order here is lora first, then lora_embedding

# save and encrypt & decrypt
@@ -490,7 +491,7 @@ def convert():

# export lora model
print("Convert lora")
convert_total_lora_to_bit("encrypted_lora_weights.bin", origin_model, 1, 1, 0, args)
convert_total_lora_to_bit("encrypted_lora_weights.bin", origin_model, args)

print("Convert lora embedding")
convert_lora_embedding()
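
The *_to_bit helpers above store fp32 LoRA weights as raw bfloat16 bytes: view the fp32 bits as uint32, keep the top 16 bits, then reinterpret as bytes. A minimal, self-contained round trip of that trick (assuming a little-endian host, as the exporter does):

# Sketch: fp32 -> bfloat16 bit-truncation used by convert_lora_to_bit, plus the inverse.
import numpy as np

def fp32_to_bf16_bytes(w: np.ndarray) -> np.ndarray:
    w = np.ascontiguousarray(w, dtype=np.float32).flatten()
    u16 = (w.view(np.uint32) >> 16).astype(np.uint16)  # drop the low 16 mantissa bits
    return u16.view(np.uint8)                          # raw bfloat16 bytes

def bf16_bytes_to_fp32(b: np.ndarray) -> np.ndarray:
    u32 = b.view(np.uint16).astype(np.uint32) << 16    # restore the fp32 bit layout
    return u32.view(np.float32)

x = np.random.randn(8, 4).astype(np.float32)
x_rt = bf16_bytes_to_fp32(fp32_to_bf16_bytes(x)).reshape(x.shape)
print(np.abs(x - x_rt).max())                          # small truncation-only error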
1 change: 0 additions & 1 deletion models/Qwen2/lora_demo/test_a16matmul.py
100644 → 100755
@@ -108,7 +108,6 @@ def test_a16matmul():
if not os.path.exists(dir_path):
os.makedirs(dir_path)

torch.manual_seed(0)
x = torch.randn(4, 512).float()

inputs = {'x': x.numpy()}
14 changes: 8 additions & 6 deletions models/Qwen2/lora_demo/test_block.py
100644 → 100755
@@ -89,15 +89,17 @@ def dequant(npz_file, op_name, q_group_size, hidden_size):
dequant_weight[1::2] = dequant_weights_low
return dequant_weight

def get_dequant_weight_dic(fp32_file, npz_file, fp32_op_name_list, op_name_list, op_shape_list, q_group_size, cos_sim_threshold):
def get_dequant_weight_dic(fp32_file, npz_file, fp32_op_name_list, op_name_list, op_shape_list, q_group_size, cos_sim_threshold, verify=True):
dequant_weight_dic = {}
for fp32_op_name, op_name, op_shape in zip(fp32_op_name_list, op_name_list, op_shape_list):
dequant_weight = dequant(npz_file, op_name, q_group_size, op_shape[1]) # use op_shape[1] here instead of HIDDEN_SIZE
fp32_weight = fp32_file[fp32_op_name].flatten()
dequant_bf16_weight = dequant_weight.reshape(op_shape).transpose(1,0).flatten()
cos_sim = cosine_similarity(fp32_weight, dequant_bf16_weight)
if cos_sim < cos_sim_threshold:
raise ValueError(f"cos_sim : {cos_sim}, failed")

if verify:
fp32_weight = fp32_file[fp32_op_name].flatten()
dequant_bf16_weight = dequant_weight.reshape(op_shape).transpose(1,0).flatten()
cos_sim = cosine_similarity(fp32_weight, dequant_bf16_weight)
if cos_sim < cos_sim_threshold:
raise ValueError(f"cos_sim : {cos_sim}, failed")
dequant_torch_weight = torch.FloatTensor(dequant_weight.reshape(op_shape))
dequant_weight_dic[op_name] = torch.nn.Parameter(dequant_torch_weight, requires_grad=False)
return dequant_weight_dic
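
The new verify flag lets callers skip the cosine-similarity check between the dequantized bf16 weights and the fp32 reference (test_lora.py below passes verify=False). For completeness, a minimal stand-in for the cosine_similarity helper imported from test_a16matmul, assuming it takes two flat float arrays:

# Sketch: the kind of check gated by verify=True above.
import numpy as np

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    a = np.asarray(a, dtype=np.float32).flatten()
    b = np.asarray(b, dtype=np.float32).flatten()
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-12))

# usage, as in the verify branch:
# if cosine_similarity(fp32_weight, dequant_bf16_weight) < cos_sim_threshold:
#     raise ValueError("cos_sim check failed")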
Empty file modified models/Qwen2/lora_demo/test_llm.py
100644 → 100755
Empty file.
169 changes: 84 additions & 85 deletions models/Qwen2/lora_demo/test_lora.py
@@ -20,42 +20,44 @@
from transformers import AutoModelForCausalLM, AutoTokenizer
torch.set_grad_enabled(False)

from export_onnx import load_model, load_lora_model, setup_environment, convert_total_lora_to_bit
from export_onnx import load_model, load_lora_model, setup_environment, encrypt_and_save, convert_lora_embedding_to_bit, convert_lora_to_bit
from test_a16matmul import uint16_to_float32, cosine_similarity
from test_block import get_dequant_weight_dic

def test_lora(lora_scale, lora_offset):
folder = f"./test_block"
q_group_size = 64
cos_sim_threshold = 0.98
print(f"lora_scale : {lora_scale}")
print(f"lora_offset : {lora_offset}")
def get_lora_model(origin_model, config_path):
import copy
from peft import LoraConfig, PeftModel, get_peft_model
# 1. load lora config
if not os.path.exists(config_path):
raise FileNotFoundError(f"Neither config.json nor adapter_config.json found in {path}")
with open(config_path) as f:
lora_config_dict = json.load(f)
lora_config = LoraConfig(**lora_config_dict)

lora_model, lora_config = load_lora_model(origin_model, args.lora_path)
for i in range(NUM_LAYERS):
# hook dequant weight from npz in compile
fp32_npz_name = f"{folder}/block_{i}_top_f32_all_weight.npz"
addressed_npz_name = f"{folder}/block_{i}_tpu_addressed_bm1684x_w4bf16_weight.npz"
fp32_file = np.load(fp32_npz_name)
npz_file = np.load(addressed_npz_name)
dequant_weight_dic = get_dequant_weight_dic(fp32_file, npz_file, fp32_op_name_list, op_name_list, op_shape_list, q_group_size, cos_sim_threshold)
lora_model = get_peft_model(copy.deepcopy(origin_model), lora_config)
return lora_model, lora_config

# assign dequant weight to model
cur_layer = lora_model.base_model.model.model.layers[i]
cur_layer.self_attn.q_proj.base_layer.weight = dequant_weight_dic[op_name_list[0]]
cur_layer.self_attn.k_proj.base_layer.weight = dequant_weight_dic[op_name_list[1]]
cur_layer.self_attn.v_proj.base_layer.weight = dequant_weight_dic[op_name_list[2]]
cur_layer.self_attn.o_proj.base_layer.weight = dequant_weight_dic[op_name_list[3]]

cur_layer.mlp.gate_proj.base_layer.weight = dequant_weight_dic[op_name_list[4]]
cur_layer.mlp.up_proj.base_layer.weight = dequant_weight_dic[op_name_list[5]]
cur_layer.mlp.down_proj.base_layer.weight = dequant_weight_dic[op_name_list[6]]
def create_lora_model(origin_model, config_path, lora_scale, lora_embedding_scale):
lora_model, lora_config = get_lora_model(origin_model, config_path)

for i in range(NUM_LAYERS):
cur_layer = lora_model.base_model.model.model.layers[i]
# assign lora weight to model
for name, module in cur_layer.named_modules():
if 'lora_A.default' in name or 'lora_B.default' in name:
if any(layer_name in name for layer_name in list(lora_config.target_modules)):
module.weight = torch.nn.Parameter((module.weight + lora_offset) * lora_scale, requires_grad=False)
torch_weight = torch.FloatTensor(np.random.randn(*module.weight.shape) * lora_scale)
module.weight = torch.nn.Parameter(torch_weight, requires_grad=False)

lora_embed = lora_model.base_model.model.model.embed_tokens
for name, module in lora_embed.named_modules():
if 'lora_embedding_A' in name or 'lora_embedding_B' in name:
torch_weight = torch.FloatTensor(np.random.randn(*module.default.shape) * lora_embedding_scale)
module.default = torch.nn.Parameter(torch_weight, requires_grad=False)
return lora_model, lora_config

def generate(model, prefix, dir_path):
from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask

prompt = "Give me a short introduction to large language model."
messages = [
@@ -67,50 +69,63 @@ def test_lora(lora_scale, lora_offset):
tokenize=False,
add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(device)
generated_ids = lora_model.generate(
model_inputs.input_ids,
max_new_tokens=20
)
generated_ids = [
output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)

def test_lora_embedding(lora_embedding_scale, lora_offset):
model_inputs = tokenizer([text], return_tensors="pt").to(device)
model.model.model.norm = nn.Identity()
hidden_states = model.model.model.forward(model_inputs.input_ids)[0]
np.save(f"{dir_path}/{prefix}_torch_hidden_states.npy", hidden_states.numpy())

def convert_total_lora_to_bit(encrypt_path, lora_model, lora_config, args):
if args.max_rank_num == 0:
raise ValueError(f"max_rank_num is equal to {args.max_rank_num}")
if args.max_embedding_rank_num == 0:
raise ValueError(f"max_embedding_rank_num is equal to {args.max_embedding_rank_num}")

# add zero to check after decrypt
zero_prefix = np.zeros(64, dtype=np.uint8)
# lora embedding
lora_embedding_weights = convert_lora_embedding_to_bit(lora_model, lora_config, args)
# lora
lora_weights = convert_lora_to_bit(lora_model, lora_config, args)
total_lora_weights = np.concatenate([zero_prefix, lora_weights, lora_embedding_weights]) # in the bmodel, lora_embedding is placed last, so the order here is lora first, then lora_embedding

# encrypt
encrypt_and_save(total_lora_weights, encrypt_path, args)

def test_lora(lora_scale, lora_embedding_scale, dir_path):
folder = f"./test_block"
q_group_size = 64
cos_sim_threshold = 0.98
setup_environment()
print(f"\nlora_scale : {lora_scale}")
print(f"lora_embedding_scale : {lora_embedding_scale}")
print(f"lora_offset : {lora_offset}")

lora_model, lora_config = load_lora_model(origin_model, args.lora_embedding_path)
lora_embed = lora_model.base_model.model.model.embed_tokens
for name, module in lora_embed.named_modules():
if 'lora_embedding_A' in name or 'lora_embedding_B' in name:
module.default = torch.nn.Parameter((module.default + lora_offset) * lora_embedding_scale, requires_grad=False)
lora_model, lora_config = create_lora_model(origin_model, args.lora_config_path, lora_scale, lora_embedding_scale)
for i in range(NUM_LAYERS):
# hook dequant weight from npz in compile
fp32_npz_name = f"{folder}/block_{i}_top_f32_all_weight.npz"
addressed_npz_name = f"{folder}/block_{i}_tpu_addressed_bm1684x_w4bf16_weight.npz"
fp32_file = np.load(fp32_npz_name)
npz_file = np.load(addressed_npz_name)
dequant_weight_dic = get_dequant_weight_dic(fp32_file, npz_file, fp32_op_name_list, op_name_list, op_shape_list, q_group_size, cos_sim_threshold, verify=False)

prompt = "Give me a short introduction to large language model."
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(device)
generated_ids = lora_model.generate(
model_inputs.input_ids,
max_new_tokens=20
)
generated_ids = [
output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)
# assign dequant weight to model
cur_layer = lora_model.base_model.model.model.layers[i]
cur_layer.self_attn.q_proj.base_layer.weight = dequant_weight_dic[op_name_list[0]]
cur_layer.self_attn.k_proj.base_layer.weight = dequant_weight_dic[op_name_list[1]]
cur_layer.self_attn.v_proj.base_layer.weight = dequant_weight_dic[op_name_list[2]]
cur_layer.self_attn.o_proj.base_layer.weight = dequant_weight_dic[op_name_list[3]]

cur_layer.mlp.gate_proj.base_layer.weight = dequant_weight_dic[op_name_list[4]]
cur_layer.mlp.up_proj.base_layer.weight = dequant_weight_dic[op_name_list[5]]
cur_layer.mlp.down_proj.base_layer.weight = dequant_weight_dic[op_name_list[6]]

prefix = f"scale{lora_scale}_embedding_scale{lora_embedding_scale}"
# generate
generate(lora_model, prefix=prefix, dir_path=dir_path)

# encrypt and save
convert_total_lora_to_bit(f"{dir_path}/{prefix}_encrypted_lora_weights.bin", lora_model, lora_config, args)


if __name__ == "__main__":
Expand All @@ -124,8 +139,7 @@ def test_lora_embedding(lora_embedding_scale, lora_offset):
parser.add_argument('--max_pos_len', type=int, default=8704, help="max position length")
parser.add_argument('--generation_mode', type=str, default="default", choices=["default", "lmhead_with_penalty", "lmhead_with_sample", "lmhead_with_top1"], help="generation mode")
parser.add_argument('--lib_path', type=str, default='', help='lib path by user')
parser.add_argument('--lora_path', type=str, default="", help="path to the lora model")
parser.add_argument('--lora_embedding_path', type=str, default="", help="path to the lora embedding model")
parser.add_argument('--lora_config_path', type=str, default="", help="path to the lora config")
parser.add_argument('--max_rank_num', type=int, default=0, help="the max rank for lora model")
parser.add_argument('--max_embedding_rank_num', type=int, default=0, help="the max rank for lora embedding model")
args = parser.parse_args()
@@ -181,25 +195,10 @@ def test_lora_embedding(lora_embedding_scale, lora_offset):
# create folder to store onnx
if not os.path.exists(dir_path):
os.makedirs(dir_path)
print("开始转化encrypted_lora_weights,用于test_pipeline.py")
convert_total_lora_to_bit(f"{dir_path}/encrypted_lora_weights_0_0.bin", origin_model, 0, 0, 0, args)
convert_total_lora_to_bit(f"{dir_path}/encrypted_lora_weights_1_0.bin", origin_model, 1, 0, 0, args)
convert_total_lora_to_bit(f"{dir_path}/encrypted_lora_weights_0_1.bin", origin_model, 0, 1, 0, args)
convert_total_lora_to_bit(f"{dir_path}/encrypted_lora_weights_10_0.bin", origin_model, 10, 0, 0.0001, args)
convert_total_lora_to_bit(f"{dir_path}/encrypted_lora_weights_20_0.bin", origin_model, 20, 0, 0.0001, args)
convert_total_lora_to_bit(f"{dir_path}/encrypted_lora_weights_30_0.bin", origin_model, 30, 0, 0.0001, args)
convert_total_lora_to_bit(f"{dir_path}/encrypted_lora_weights_0_10.bin", origin_model, 0, 10, 0.0001, args)
convert_total_lora_to_bit(f"{dir_path}/encrypted_lora_weights_0_20.bin", origin_model, 0, 20, 0.0001, args)
convert_total_lora_to_bit(f"{dir_path}/encrypted_lora_weights_0_30.bin", origin_model, 0, 30, 0.0001, args)

print("-------------------test_lora-------------------")
test_lora(lora_scale=1, lora_offset=0.0)
test_lora(lora_scale=10, lora_offset=0.0001)
test_lora(lora_scale=20, lora_offset=0.0001)
test_lora(lora_scale=30, lora_offset=0.0001)

print("-------------------test_lora_embedding-------------------")
test_lora_embedding(lora_embedding_scale=1, lora_offset=0.0)
test_lora_embedding(lora_embedding_scale=10, lora_offset=0.0001)
test_lora_embedding(lora_embedding_scale=20, lora_offset=0.0001)
test_lora_embedding(lora_embedding_scale=30, lora_offset=0.0001)
lora_scale_list = [0, 0.1, 0, 0.001, 0.005, 0.01]
lora_embedding_scale_list = [0, 0, 0.1, 0.001, 0.005, 0.01]
for lora_scale, lora_embedding_scale in zip(lora_scale_list, lora_embedding_scale_list):
test_lora(lora_scale=lora_scale, lora_embedding_scale=lora_embedding_scale, dir_path=dir_path)
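
The loop above sweeps paired (lora_scale, lora_embedding_scale) values; since create_lora_model fills both lora_A and lora_B with randn * scale, the perturbation applied on top of the frozen base weights grows roughly with the square of the scale. A minimal sketch of the effective update for one target projection, using r=8 and lora_alpha=16 from adapter_config.json (the matrix sizes are illustrative, not taken from the model):

# Sketch: effective delta_W = (alpha / r) * B @ A for one projection, with A and B
# drawn the same way create_lora_model draws them (random normal times lora_scale).
import numpy as np

r, alpha = 8, 16
in_features, out_features = 1024, 1024   # illustrative sizes only
lora_scale = 0.01

A = np.random.randn(r, in_features).astype(np.float32) * lora_scale    # lora_A.default
B = np.random.randn(out_features, r).astype(np.float32) * lora_scale   # lora_B.default

delta_w = (alpha / r) * (B @ A)          # what LoRA adds on top of the frozen weight
print(delta_w.shape, float(np.abs(delta_w).mean()))   # magnitude scales ~ lora_scale**2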
