Skip to content

Commit

Permalink
分析pretrain和sft的token数量分布
Browse files Browse the repository at this point in the history
  • Loading branch information
Coobiw committed Mar 14, 2024
1 parent 8857382 commit 70a797f
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 0 deletions.
49 changes: 49 additions & 0 deletions tokenize_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import json
import os

import matplotlib.pyplot as plt
from tqdm import tqdm
from transformers import AutoTokenizer

# Load the tokenizer from the local Qwen-14B-Chat checkpoint directory.
# The keyword options are gathered in one mapping and unpacked into the call:
#   - use_fast=False requests the non-"fast" tokenizer implementation;
#   - trust_remote_code=True allows custom tokenizer code to be executed
#     (NOTE(review): presumably the code shipped with the checkpoint — confirm).
tokenizer_options = {
    "cache_dir": "cache",
    "padding_side": "right",
    "use_fast": False,
    "trust_remote_code": True,
}
llm_tokenizer = AutoTokenizer.from_pretrained(
    "cache/ckpt/Qwen-14B-Chat",
    **tokenizer_options,
)

# Reuse the tokenizer's `eod_id` as the padding token id
# (NOTE(review): `eod_id` is presumably the end-of-document token — confirm).
llm_tokenizer.pad_token_id = llm_tokenizer.eod_id

# Load the two datasets whose token-length distributions are analysed below.
# Each file is parsed as JSON and later iterated as a sequence of records that
# are indexed by the "instruction" and "output" keys.
#
# The encoding is given explicitly: without it, open() falls back to the
# platform's locale encoding, which can fail on (or silently mis-decode)
# non-ASCII text on systems whose default is not UTF-8.
with open(
    "cache/dataset/llava_pretrain/blip_laion_cc_sbu_558k/llava_pretrain_minigpt4qwen_format.json",
    "r",
    encoding="utf-8",
) as f:
    pretrain_data = json.load(f)
with open(
    "cache/dataset/llava_instruct/llava_instruction_100k.json",
    "r",
    encoding="utf-8",
) as f:
    sft_data = json.load(f)


# Markers that open and close each turn of the chat-formatted text.
im_start = "<|im_start|>"
im_end = "<|im_end|>"

# Stage names, in the same order as the datasets iterated below; used for the
# plot titles, the output file names and the printed summary.
plot_title = ["Pretrain", "SFT"]

# Constant added to every sample's text-token count.
# NOTE(review): presumably the number of tokens the image occupies in the LLM
# input — confirm against the model configuration.
num_image = 32

# The system turn is identical for every sample, so it is built once here
# instead of once per record.
system_message = im_start + "system\nYou are a helpful assistant." + im_end + "\n"

# Make sure the output directory exists before the first figure is saved.
output_dir = "./vis"
os.makedirs(output_dir, exist_ok=True)

for i, datas in enumerate([pretrain_data, sft_data]):
    stage = plot_title[i]

    # Per-sample token counts for the current stage only.
    token_nums = []
    for data in tqdm(datas):
        # Strip the image placeholder so only the textual prompt is tokenized;
        # the image's share is accounted for by `num_image` below.
        question = data["instruction"].replace("<Img><ImageHere></Img> ", "")
        answer = data["output"]

        user_message = im_start + f"user\n{question}" + im_end + "\n"
        assistant_message = im_start + f"assistant\n{answer}" + im_end + "\n"
        whole_text = system_message + user_message + assistant_message

        token_nums.append(len(llm_tokenizer(whole_text).input_ids) + num_image)

    # One histogram per stage, written to <output_dir>/<stage>_token_distribution.png.
    plt.hist(token_nums, bins=20, edgecolor='black')
    plt.title(f'Token Lengths Histogram of {stage}')
    plt.xlabel('Token Length')
    plt.ylabel('Frequency')
    plt.savefig(f"{output_dir}/{stage}_token_distribution.png")

    # Close the figure so the next stage starts from an empty canvas.
    plt.close()

    # `default=0` keeps an empty dataset from raising ValueError in max().
    print(f"Max Tokens in {stage} Stage:\t{max(token_nums, default=0)}")

Binary file added vis/Pretrain_token_distribution.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added vis/SFT_token_distribution.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.

0 comments on commit 70a797f

Please sign in to comment.