From b62604e2a7613ea9e36d044daedf09b41fa89682 Mon Sep 17 00:00:00 2001
From: gxlover0625 <2031808280@qq.com>
Date: Wed, 16 Oct 2024 23:05:22 +0800
Subject: [PATCH 1/2] update the README to add a demo of multi-turn
 conversations

---
 README.md | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 86 insertions(+)

diff --git a/README.md b/README.md
index 87de854..f1c0b16 100644
--- a/README.md
+++ b/README.md
@@ -612,6 +612,92 @@ model = Qwen2VLForConditionalGeneration.from_pretrained(
 )
 ```
 
+### Multi-turn Conversation
+There is an easy way to use Qwen2-VL for multi-turn conversation which supports pure text, single image, multi-images. You can use it as follows:
+
+First, copy the `Qwen2VL` class.
+```python
+import gc
+import torch
+from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
+from qwen_vl_utils import process_vision_info
+
+class Qwen2VL:
+    def __init__(self, model_path=None, max_new_tokens=1024, min_pixels=256*28*28, max_pixels=1280*28*28):
+        self.model = Qwen2VLForConditionalGeneration.from_pretrained(
+            model_path,
+            torch_dtype="auto",
+            device_map="auto",
+        )
+        self.processor = AutoProcessor.from_pretrained(model_path, min_pixels=min_pixels, max_pixels=max_pixels)
+        self.gen_config = {
+            "max_new_tokens": max_new_tokens,
+        }
+
+    def parse_input(self, query=None, imgs=None):
+        if imgs is None:
+            messages = [{"role": "user", "content": query}]
+            return messages
+
+        if isinstance(imgs, str):
+            imgs = [imgs]
+        content = []
+        for img in imgs:
+            content.append({"type": "image", "image": img})
+        content.append({"type": "text", "text": query})
+        messages = [{"role": "user", "content": content}]
+        return messages
+
+    def chat(self, query=None, imgs=None, history=None):
+        if history is None:
+            history = []
+
+        user_query = self.parse_input(query, imgs)
+        history.extend(user_query)
+
+        # add_vision_id=True numbers the images ("Picture 1:", ...) so they can be referenced across turns
+        text = self.processor.apply_chat_template(history, tokenize=False, add_generation_prompt=True, add_vision_id=True)
+        image_inputs, video_inputs = process_vision_info(history)
+        inputs = self.processor(
+            text=[text],
+            images=image_inputs,
+            videos=video_inputs,
+            padding=True,
+            return_tensors="pt",
+        )
+
+        inputs = inputs.to("cuda")
+        generated_ids = self.model.generate(**inputs, **self.gen_config)
+        generated_ids_trimmed = [
+            out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        response = self.processor.batch_decode(
+            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )[0]
+
+        history.append({"role": "assistant", "content": response})
+
+        # Release this turn's tensors so GPU memory does not accumulate across turns
+        del inputs, generated_ids, generated_ids_trimmed
+        torch.cuda.empty_cache()
+        gc.collect()
+        return response, history
+```
+Then use the `chat` API in `Qwen2VL` class.
+```python
+chat_model = Qwen2VL(model_path="local path/repo id")
+
+# First turn
+history = None
+response, history = chat_model.chat(query="hello", history=history)
+print(response, history)
+
+# Second turn
+# For image type: an image URL, a local image path, or a base64-encoded image
+# For image count: [image], [image1, image2], ...
+response, history = chat_model.chat(query="please describe the image", imgs=["image_url"], history=history)
+print(response, history)
+```
 
 ### Try Qwen2-VL-72B with API!
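For reference, the sketch below shows the shape that `history` takes after the two turns in the usage example above. It is only an illustration of the data structure built by `parse_input` and `chat`; the assistant replies and the image URL are placeholders, not real model outputs.

```python
# Illustrative contents of `history` after one pure-text turn and one image turn
history = [
    # Turn 1: pure text -- `parse_input` stores the query string directly
    {"role": "user", "content": "hello"},
    {"role": "assistant", "content": "Hello! How can I help you today?"},
    # Turn 2: image + text -- `parse_input` builds a content list instead
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "https://example.com/demo.jpg"},
            {"type": "text", "text": "please describe the image"},
        ],
    },
    {"role": "assistant", "content": "The image shows ..."},
]
```

Because `chat` both extends and returns this list, passing the returned `history` back into the next call is all that is needed to preserve the full visual and textual context.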
From 9bc2a9a7048e13784576d28e8d1dfb096ac88b54 Mon Sep 17 00:00:00 2001
From: gxlover0625 <2031808280@qq.com>
Date: Wed, 16 Oct 2024 23:12:01 +0800
Subject: [PATCH 2/2] update the README to polish the wording of the
 multi-turn conversation demo

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index f1c0b16..5ba56ab 100644
--- a/README.md
+++ b/README.md
@@ -613,7 +613,7 @@ model = Qwen2VLForConditionalGeneration.from_pretrained(
 ```
 
 ### Multi-turn Conversation
-There is an easy way to use Qwen2-VL for multi-turn conversation which supports pure text, single image, multi-images. You can use it as follows:
+There is an easy way to use Qwen2-VL for multi-turn conversations, with support for pure text, a single image, or multiple images per turn. You can use it as follows:
 
 First, copy the `Qwen2VL` class.
 ```python
@@ -683,7 +683,7 @@ class Qwen2VL:
         gc.collect()
         return response, history
 ```
-Then use the `chat` API in `Qwen2VL` class.
+Then use the `chat` API of the `Qwen2VL` class: the `query` parameter is the user query in natural language, the `imgs` parameter accepts an image URL, a local path, or a base64-encoded image (or a list of them), and the `history` parameter carries the conversation history.
 ```python
 chat_model = Qwen2VL(model_path="local path/repo id")
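Building on the `chat` API described above, the following minimal sketch continues the conversation with a multi-image turn and a pure-text follow-up; the image paths are placeholders, not files shipped with the repository.

```python
# Third turn: multiple images in a single query (the paths are placeholders)
response, history = chat_model.chat(
    query="What are the differences between these two images?",
    imgs=["/path/to/image1.jpg", "/path/to/image2.jpg"],
    history=history,
)
print(response)

# Fourth turn: pure text follow-up; the earlier images stay visible to the
# model because they remain in `history`
response, history = chat_model.chat(query="Summarize our conversation so far.", history=history)
print(response)
```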