diff --git a/tokenize_analysis.py b/tokenize_analysis.py
new file mode 100644
index 0000000..05451b7
--- /dev/null
+++ b/tokenize_analysis.py
@@ -0,0 +1,49 @@
+import json
+from transformers import AutoTokenizer
+
+import matplotlib.pyplot as plt
+from tqdm import tqdm
+
+llm_tokenizer = AutoTokenizer.from_pretrained(
+ "cache/ckpt/Qwen-14B-Chat",
+ cache_dir="cache",
+ padding_side="right",
+ use_fast=False,
+ trust_remote_code=True
+ )
+llm_tokenizer.pad_token_id = llm_tokenizer.eod_id
+
+with open("cache/dataset/llava_pretrain/blip_laion_cc_sbu_558k/llava_pretrain_minigpt4qwen_format.json",'r') as f:
+ pretrain_data = json.load(f)
+with open("cache/dataset/llava_instruct/llava_instruction_100k.json",'r') as f:
+ sft_data = json.load(f)
+
+
+token_nums = []
+im_start = "<|im_start|>"
+im_end = "<|im_end|>"
+
+plot_title = ["Pretrain", "SFT"]
+num_image = 32
+for i,datas in enumerate([pretrain_data, sft_data]):
+ token_nums = []
+ for data in tqdm(datas):
+ question = data["instruction"].replace(" ","")
+ answer = data["output"]
+ system_message = im_start + "system\nYou are a helpful assistant." + im_end + "\n"
+ user_message = im_start + f"user\n{question}" + im_end + "\n"
+ assistant_message = im_start + f"assistant\n{answer}" + im_end + "\n"
+ whole_text = system_message + user_message + assistant_message
+
+ token_nums.append(len(llm_tokenizer(whole_text).input_ids) + num_image)
+
+ plt.hist(token_nums, bins=20, edgecolor='black')
+ plt.title(f'Token Lengths Histogram of {plot_title[i]}')
+ plt.xlabel('Token Length')
+ plt.ylabel('Frequency')
+ plt.savefig(f"./vis/{plot_title[i]}_token_distribution.png")
+
+ plt.close()
+
+ print(f"Max Tokens in {plot_title[i]} Stage:\t{max(token_nums)}")
+
diff --git a/vis/Pretrain_token_distribution.png b/vis/Pretrain_token_distribution.png
new file mode 100644
index 0000000..8acf3c4
Binary files /dev/null and b/vis/Pretrain_token_distribution.png differ
diff --git a/vis/SFT_token_distribution.png b/vis/SFT_token_distribution.png
new file mode 100644
index 0000000..4cf581f
Binary files /dev/null and b/vis/SFT_token_distribution.png differ