diff --git a/README.md b/README.md
index 1358856ae..5612a8034 100644
--- a/README.md
+++ b/README.md
@@ -65,20 +65,21 @@ applications in a centralized programming manner for streamlined development.
AgentScope provides a list of `ModelWrapper` to support both local model
services and third-party model APIs.
-| API | Task | Model Wrapper | Example Configuration | Some Supported Models |
-|------------------------|-----------------|---------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------|-------------------------------------------|
-| OpenAI API | Chat | [`OpenAIChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#openai-api) | gpt-4, gpt-3.5-turbo, ... |
-| | Embedding | [`OpenAIEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#openai-api) | text-embedding-ada-002, ... |
-| | DALL·E | [`OpenAIDALLEWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#openai-api) | dall-e-2, dall-e-3 |
-| DashScope API | Chat | [`DashScopeChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#dashscope-api) | qwen-plus, qwen-max, ... |
-| | Image Synthesis | [`DashScopeImageSynthesisWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#dashscope-api) | wanx-v1 |
-| | Text Embedding | [`DashScopeTextEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#dashscope-api) | text-embedding-v1, text-embedding-v2, ... |
-| Gemini API | Chat | [`GeminiChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/gemini_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#gemini-api) | gemini-pro, ... |
-| | Embedding | [`GeminiEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/gemini_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#gemini-api) | models/embedding-001, ... |
-| ollama | Chat | [`OllamaChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#ollama-api) | llama2, Mistral, ... |
-| | Embedding | [`OllamaEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#ollama-api) | llama2, Mistral, ... |
-| | Generation | [`OllamaGenerationWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#ollama-api) | llama2, Mistral, ... |
-| Post Request based API | - | [`PostAPIModelWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/post_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#post-request-api) | - |
+| API | Task | Model Wrapper | Example Configuration | Some Supported Models |
+|------------------------|-----------------|---------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------|----------------------------------------------|
+| OpenAI API | Chat | [`OpenAIChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#openai-api) | gpt-4, gpt-3.5-turbo, ... |
+| | Embedding | [`OpenAIEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#openai-api) | text-embedding-ada-002, ... |
+| | DALL·E | [`OpenAIDALLEWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#openai-api) | dall-e-2, dall-e-3 |
+| DashScope API | Chat | [`DashScopeChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#dashscope-api) | qwen-plus, qwen-max, ... |
+| | Image Synthesis | [`DashScopeImageSynthesisWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#dashscope-api) | wanx-v1 |
+| | Text Embedding | [`DashScopeTextEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#dashscope-api) | text-embedding-v1, text-embedding-v2, ... |
+|                        | Multimodal      | [`DashScopeMultiModalWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py)      | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#dashscope-api)     | qwen-vl-plus, qwen-vl-max, qwen-audio-turbo, ... |
+| Gemini API | Chat | [`GeminiChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/gemini_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#gemini-api) | gemini-pro, ... |
+| | Embedding | [`GeminiEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/gemini_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#gemini-api) | models/embedding-001, ... |
+| ollama | Chat | [`OllamaChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#ollama-api) | llama2, Mistral, ... |
+| | Embedding | [`OllamaEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#ollama-api) | llama2, Mistral, ... |
+| | Generation | [`OllamaGenerationWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#ollama-api) | llama2, Mistral, ... |
+| Post Request based API | - | [`PostAPIModelWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/post_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#post-request-api) | - |
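+
+For example (a minimal sketch; the config name, model name, and API key below are placeholders), a wrapper can also be constructed directly in Python:
+
+```python
+from agentscope.models import DashScopeChatWrapper
+
+# Hypothetical values: replace with your own model name and API key
+model = DashScopeChatWrapper(
+    config_name="my_dashscope_chat",
+    model_name="qwen-max",
+    api_key="{your_api_key}",
+)
+```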
**Supported Local Model Deployment**
diff --git a/README_ZH.md b/README_ZH.md
index 745450e22..979dc246d 100644
--- a/README_ZH.md
+++ b/README_ZH.md
@@ -54,20 +54,21 @@ AgentScope是一个创新的多智能体开发平台,旨在赋予开发人员
AgentScope提供了一系列`ModelWrapper`来支持本地模型服务和第三方模型API。
-| API | Task | Model Wrapper | Example Configuration | Some Supported Models |
-|------------------------|-----------------|---------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------|-------------------------------------------|
-| OpenAI API | Chat | [`OpenAIChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#openai-api) | gpt-4, gpt-3.5-turbo, ... |
-| | Embedding | [`OpenAIEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#openai-api) | text-embedding-ada-002, ... |
-| | DALL·E | [`OpenAIDALLEWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#openai-api) | dall-e-2, dall-e-3 |
-| DashScope API | Chat | [`DashScopeChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#dashscope-api) | qwen-plus, qwen-max, ... |
-| | Image Synthesis | [`DashScopeImageSynthesisWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#dashscope-api) | wanx-v1 |
-| | Text Embedding | [`DashScopeTextEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#dashscope-api) | text-embedding-v1, text-embedding-v2, ... |
-| Gemini API | Chat | [`GeminiChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/gemini_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#gemini-api) | gemini-pro, ... |
-| | Embedding | [`GeminiEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/gemini_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#gemini-api) | models/embedding-001, ... |
-| ollama | Chat | [`OllamaChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#ollama-api) | llama2, Mistral, ... |
-| | Embedding | [`OllamaEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#ollama-api) | llama2, Mistral, ... |
-| | Generation | [`OllamaGenerationWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#ollama-api) | llama2, Mistral, ... |
-| Post Request based API | - | [`PostAPIModelWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/post_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#post-request-api) | - |
+| API | Task | Model Wrapper | Example Configuration | Some Supported Models |
+|------------------------|-----------------|---------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------|----------------------------------------------|
+| OpenAI API | Chat | [`OpenAIChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#openai-api) | gpt-4, gpt-3.5-turbo, ... |
+| | Embedding | [`OpenAIEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#openai-api) | text-embedding-ada-002, ... |
+| | DALL·E | [`OpenAIDALLEWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#openai-api) | dall-e-2, dall-e-3 |
+| DashScope API | Chat | [`DashScopeChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#dashscope-api) | qwen-plus, qwen-max, ... |
+| | Image Synthesis | [`DashScopeImageSynthesisWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#dashscope-api) | wanx-v1 |
+| | Text Embedding | [`DashScopeTextEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#dashscope-api) | text-embedding-v1, text-embedding-v2, ... |
+|                        | Multimodal      | [`DashScopeMultiModalWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py)      | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#dashscope-api)     | qwen-vl-plus, qwen-vl-max, qwen-audio-turbo, ... |
+| Gemini API | Chat | [`GeminiChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/gemini_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#gemini-api) | gemini-pro, ... |
+| | Embedding | [`GeminiEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/gemini_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#gemini-api) | models/embedding-001, ... |
+| ollama | Chat | [`OllamaChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#ollama-api) | llama2, Mistral, ... |
+| | Embedding | [`OllamaEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#ollama-api) | llama2, Mistral, ... |
+| | Generation | [`OllamaGenerationWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#ollama-api) | llama2, Mistral, ... |
+| Post Request based API | - | [`PostAPIModelWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/post_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#post-request-api) | - |
**支持的本地模型部署**
diff --git a/docs/sphinx_doc/en/source/tutorial/203-model.md b/docs/sphinx_doc/en/source/tutorial/203-model.md
index 372ce5358..5c3cb0b7e 100644
--- a/docs/sphinx_doc/en/source/tutorial/203-model.md
+++ b/docs/sphinx_doc/en/source/tutorial/203-model.md
@@ -70,20 +70,21 @@ class OpenAIChatWrapper(OpenAIWrapperBase):
In the current AgentScope, the supported `model_type` types, the corresponding
`ModelWrapper` classes, and the supported APIs are as follows:
-| API | Task | Model Wrapper | `model_type` |
-|------------------------|-----------------|---------------------------------------------------------------------------------------------------------------------------------|-------------------------------|
-| OpenAI API | Chat | [`OpenAIChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | `"openai"` |
-| | Embedding | [`OpenAIEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | `"openai_embedding"` |
-| | DALL·E | [`OpenAIDALLEWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | `"openai_dall_e"` |
-| DashScope API | Chat | [`DashScopeChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | `"dashscope_chat"` |
-| | Image Synthesis | [`DashScopeImageSynthesisWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | `"dashscope_image_synthesis"` |
-| | Text Embedding | [`DashScopeTextEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | `"dashscope_text_embedding"` |
-| Gemini API | Chat | [`GeminiChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/gemini_model.py) | `"gemini_chat"` |
-| | Embedding | [`GeminiEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/gemini_model.py) | `"gemini_embedding"` |
-| ollama | Chat | [`OllamaChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | `"ollama_chat"` |
-| | Embedding | [`OllamaEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | `"ollama_embedding"` |
-| | Generation | [`OllamaGenerationWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | `"ollama_generate"` |
-| Post Request based API | - | [`PostAPIModelWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/post_model.py) | `"post_api"` |
+| API | Task | Model Wrapper | `model_type` | Some Supported Models |
+|------------------------|-----------------|---------------------------------------------------------------------------------------------------------------------------------|-------------------------------|--------------------------------------------------|
+| OpenAI API | Chat | [`OpenAIChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | `"openai"` | gpt-4, gpt-3.5-turbo, ... |
+| | Embedding | [`OpenAIEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | `"openai_embedding"` | text-embedding-ada-002, ... |
+| | DALL·E | [`OpenAIDALLEWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | `"openai_dall_e"` | dall-e-2, dall-e-3 |
+| DashScope API | Chat | [`DashScopeChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | `"dashscope_chat"` | qwen-plus, qwen-max, ... |
+| | Image Synthesis | [`DashScopeImageSynthesisWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | `"dashscope_image_synthesis"` | wanx-v1 |
+| | Text Embedding | [`DashScopeTextEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | `"dashscope_text_embedding"` | text-embedding-v1, text-embedding-v2, ... |
+| | Multimodal | [`DashScopeMultiModalWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | `"dashscope_multimodal"` | qwen-vl-plus, qwen-vl-max, qwen-audio-turbo, ... |
+| Gemini API | Chat | [`GeminiChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/gemini_model.py) | `"gemini_chat"` | gemini-pro, ... |
+| | Embedding | [`GeminiEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/gemini_model.py) | `"gemini_embedding"` | models/embedding-001, ... |
+| ollama | Chat | [`OllamaChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | `"ollama_chat"` | llama2, ... |
+| | Embedding | [`OllamaEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | `"ollama_embedding"` | llama2, ... |
+| | Generation | [`OllamaGenerationWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | `"ollama_generate"` | llama2, ... |
+| Post Request based API | - | [`PostAPIModelWrapperBase`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/post_model.py) | `"post_api"` | - |
#### Detailed Parameters
@@ -238,6 +239,27 @@ openai_chat_config = {
+
+DashScope Multimodal Conversation API (`agentscope.models.DashScopeMultiModalWrapper`)
+
+```python
+{
+ "config_name": "my_dashscope_multimodal_config",
+ "model_type": "dashscope_multimodal",
+
+ # Required parameters
+ "model_name": "{model_name}", # The model name in DashScope Multimodal Conversation API, e.g. qwen-vl-plus
+
+ # Optional parameters
+ "api_key": "{your_api_key}",
+ "generate_args": {
+ # ...
+ },
+}
+```
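+
+Once registered, the wrapper can be loaded by its config name. Below is a minimal sketch (the API key and image url are placeholders):
+
+```python
+import agentscope
+from agentscope.models import load_model_by_config_name
+
+# Register the configuration above; the api_key is a placeholder
+agentscope.init(
+    model_configs=[
+        {
+            "config_name": "my_dashscope_multimodal_config",
+            "model_type": "dashscope_multimodal",
+            "model_name": "qwen-vl-plus",
+            "api_key": "{your_api_key}",
+        },
+    ],
+)
+
+# Load the model wrapper by its config name
+model = load_model_by_config_name("my_dashscope_multimodal_config")
+
+# Call the wrapper with messages in the multimodal format
+response = model(
+    messages=[
+        {
+            "role": "user",
+            "content": [
+                {"text": "What does this picture depict?"},
+                {"image": "http://example.com/image.jpg"},
+            ],
+        },
+    ],
+)
+print(response.text)
+```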
+
+
+
#### Gemini API
diff --git a/docs/sphinx_doc/en/source/tutorial/206-prompt.md b/docs/sphinx_doc/en/source/tutorial/206-prompt.md
index 77ee006c7..43a6e18fc 100644
--- a/docs/sphinx_doc/en/source/tutorial/206-prompt.md
+++ b/docs/sphinx_doc/en/source/tutorial/206-prompt.md
@@ -38,17 +38,21 @@ following built-in strategies for most chat and generation related model APIs.
In AgentScope, we provide built-in strategies for the following chat and
generation model APIs.
-- [`OpenAIChatWrapper`](#openaichatwrapper)
-- [`DashScopeChatWrapper`](#dashscopechatwrapper)
-- [`OllamaChatWrapper`](#ollamachatwrapper)
-- [`OllamaGenerationWrapper`](ollamagenerationwrapper)
-- [`GeminiChatWrapper`](#geminiwrapper)
+- [OpenAIChatWrapper](#openaichatwrapper)
+- [DashScopeChatWrapper](#dashscopechatwrapper)
+- [DashScopeMultiModalWrapper](#dashscopemultimodalwrapper)
+- [OllamaChatWrapper](#ollamachatwrapper)
+- [OllamaGenerationWrapper](#ollamagenerationwrapper)
+- [GeminiChatWrapper](#geminichatwrapper)
These strategies are implemented in the `format` functions of the model
wrapper classes.
It accepts `Msg` objects, a list of `Msg` objects, or their mixture as input.
+However, the `format` function will first reorganize them into a flat list
+of `Msg` objects, so for simplicity the following sections treat the input
+as a list of `Msg` objects.
-### `OpenAIChatWrapper`
+### OpenAIChatWrapper
`OpenAIChatWrapper` encapsulates the OpenAI chat API, it takes a list of
dictionaries as input, where the dictionary must obey the following rules
@@ -95,7 +99,7 @@ print(prompt)
]
```
-### `DashScopeChatWrapper`
+### DashScopeChatWrapper
`DashScopeChatWrapper` encapsulates the DashScope chat API, which takes a list of messages as input. The message must obey the following rules (updated in 2024/03/22):
@@ -138,7 +142,91 @@ print(prompt)
]
```
-### `OllamaChatWrapper`
+### DashScopeMultiModalWrapper
+
+`DashScopeMultiModalWrapper` encapsulates the DashScope multimodal conversation API, which takes a list of messages as input. The message must obey the following rules (updated in 2024/04/04):
+
+- Each message is a dictionary with `role` and `content` fields.
+ - The `role` field must be either `"user"`, `"system"`, or `"assistant"`.
+ - The `content` field must be a list of dictionaries, where
+    - Each dictionary contains only one key-value pair, whose key must be `text`, `image` or `audio`.
+    - The `text` field is a string, representing the text content.
+    - The `image` field is a string, representing the image url.
+    - The `audio` field is a string, representing the audio url.
+ - The `content` field can contain multiple dictionaries with the key `image` or multiple dictionaries with the key `audio` at the same time. For example:
+```python
+[
+ {
+ "role": "user",
+ "content": [
+ {"text": "What's the difference between these two pictures?"},
+ {"image": "https://xxx1.png"},
+ {"image": "https://xxx2.png"}
+ ]
+ },
+ {
+ "role": "assistant",
+ "content": [{"text": "The first picture is a cat, and the second picture is a dog."}]
+ },
+ {
+ "role": "user",
+ "content": [{"text": "I see, thanks!"}]
+ }
+]
+```
+- A message whose `role` field is `"system"` must be the first message in the list, and at most one such message is allowed.
+- The last message must have the `role` field as `"user"`.
+- The `user` and `assistant` messages must alternate.
+
+#### Prompt Strategy
+
+Based on the above rules, the `format` function in `DashScopeMultiModalWrapper` will parse the input messages as follows:
+
+- If the first message in the input message list has `"system"` in its `role` field, it will be converted into a system message whose `role` field is `"system"` and whose `content` field holds the system prompt. If the `url` field of the input `Msg` object is not `None`, a dictionary with the key `"image"` or `"audio"` will be added to the `content` according to its file type.
+- The rest of the messages will be merged into a single message whose `role` field is `"user"` and whose `content` field holds the dialogue history. For each of these messages, if the `url` field is not `None`, a dictionary with the key `"image"` or `"audio"` will be added to the `content` according to the file type that the `url` points to.
+
+An example:
+
+```python
+from agentscope.models import DashScopeMultiModalWrapper
+from agentscope.message import Msg
+
+model = DashScopeMultiModalWrapper(
+ config_name="", # empty since we directly initialize the model wrapper
+ model_name="qwen-vl-plus",
+)
+
+prompt = model.format(
+ Msg("system", "You're a helpful assistant", role="system", url="url_to_png1"), # Msg object
+ [ # a list of Msg objects
+ Msg(name="Bob", content="Hi!", role="assistant", url="url_to_png2"),
+ Msg(name="Alice", content="Nice to meet you!", role="assistant", url="url_to_png3"),
+ ],
+)
+print(prompt)
+```
+
+```bash
+[
+    {
+        "role": "system",
+        "content": [
+            {"image": "url_to_png1"},
+            {"text": "You're a helpful assistant"}
+        ]
+    },
+    {
+        "role": "user",
+        "content": [
+            {"image": "url_to_png2"},
+            {"image": "url_to_png3"},
+            {"text": "## Dialogue History\nBob: Hi!\nAlice: Nice to meet you!"}
+        ]
+    }
+]
+```
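+
+Note that if the `url` field of a `Msg` object points to a local file, the `format` function will automatically prefix it with `file://`, as required by the DashScope multimodal API.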
+
+### OllamaChatWrapper
`OllamaChatWrapper` encapsulates the Ollama chat API, which takes a list of
messages as input. The message must obey the following rules (updated in
@@ -183,7 +271,7 @@ print(prompt)
]
```
-### `OllamaGenerationWrapper`
+### OllamaGenerationWrapper
`OllamaGenerationWrapper` encapsulates the Ollama generation API, which
takes a string prompt as input without any constraints (updated to 2024/03/22).
diff --git a/docs/sphinx_doc/zh_CN/source/tutorial/203-model.md b/docs/sphinx_doc/zh_CN/source/tutorial/203-model.md
index fc30bc076..99c4dec93 100644
--- a/docs/sphinx_doc/zh_CN/source/tutorial/203-model.md
+++ b/docs/sphinx_doc/zh_CN/source/tutorial/203-model.md
@@ -90,20 +90,21 @@ class OpenAIChatWrapper(OpenAIWrapper):
在目前的AgentScope中,所支持的`model_type`类型,对应的`ModelWrapper`类,以及支持的
API如下:
-| API | Task | Model Wrapper | `model_type` |
-|------------------------|-----------------|---------------------------------------------------------------------------------------------------------------------------------|-------------------------------|
-| OpenAI API | Chat | [`OpenAIChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | `"openai"` |
-| | Embedding | [`OpenAIEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | `"openai_embedding"` |
-| | DALL·E | [`OpenAIDALLEWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | `"openai_dall_e"` |
-| DashScope API | Chat | [`DashScopeChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | `"dashscope_chat"` |
-| | Image Synthesis | [`DashScopeImageSynthesisWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | `"dashscope_image_synthesis"` |
-| | Text Embedding | [`DashScopeTextEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | `"dashscope_text_embedding"` |
-| Gemini API | Chat | [`GeminiChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/gemini_model.py) | `"gemini_chat"` |
-| | Embedding | [`GeminiEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/gemini_model.py) | `"gemini_embedding"` |
-| ollama | Chat | [`OllamaChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | `"ollama_chat"` |
-| | Embedding | [`OllamaEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | `"ollama_embedding"` |
-| | Generation | [`OllamaGenerationWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | `"ollama_generate"` |
-| Post Request based API | - | [`PostAPIModelWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/post_model.py) | `"post_api"` |
+| API | Task | Model Wrapper | `model_type` | Some Supported Models |
+|------------------------|-----------------|---------------------------------------------------------------------------------------------------------------------------------|-------------------------------|--------------------------------------------------|
+| OpenAI API | Chat | [`OpenAIChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | `"openai"` | gpt-4, gpt-3.5-turbo, ... |
+| | Embedding | [`OpenAIEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | `"openai_embedding"` | text-embedding-ada-002, ... |
+| | DALL·E | [`OpenAIDALLEWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | `"openai_dall_e"` | dall-e-2, dall-e-3 |
+| DashScope API | Chat | [`DashScopeChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | `"dashscope_chat"` | qwen-plus, qwen-max, ... |
+| | Image Synthesis | [`DashScopeImageSynthesisWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | `"dashscope_image_synthesis"` | wanx-v1 |
+| | Text Embedding | [`DashScopeTextEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | `"dashscope_text_embedding"` | text-embedding-v1, text-embedding-v2, ... |
+| | Multimodal | [`DashScopeMultiModalWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | `"dashscope_multimodal"` | qwen-vl-plus, qwen-vl-max, qwen-audio-turbo, ... |
+| Gemini API | Chat | [`GeminiChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/gemini_model.py) | `"gemini_chat"` | gemini-pro, ... |
+| | Embedding | [`GeminiEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/gemini_model.py) | `"gemini_embedding"` | models/embedding-001, ... |
+| ollama | Chat | [`OllamaChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | `"ollama_chat"` | llama2, ... |
+| | Embedding | [`OllamaEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | `"ollama_embedding"` | llama2, ... |
+| | Generation | [`OllamaGenerationWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | `"ollama_generate"` | llama2, ... |
+| Post Request based API | - | [`PostAPIModelWrapperBase`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/post_model.py) | `"post_api"` | - |
#### 详细参数
@@ -258,6 +259,27 @@ openai_chat_config = {
+
+DashScope Multimodal Conversation API (`agentscope.models.DashScopeMultiModalWrapper`)
+
+```python
+{
+ "config_name": "my_dashscope_multimodal_config",
+ "model_type": "dashscope_multimodal",
+
+ # Required parameters
+ "model_name": "{model_name}", # The model name in DashScope Multimodal Conversation API, e.g. qwen-vl-plus
+
+ # Optional parameters
+ "api_key": "{your_api_key}",
+ "generate_args": {
+ # ...
+ },
+}
+```
+
+
+
#### Gemini API
@@ -417,8 +439,8 @@ AgentScope允许开发者自定义自己的模型包装器。新的模型包装
```python
from agentscope.models import ModelWrapperBase
-class MyModelWrapper(ModelWrapperBase):
+class MyModelWrapper(ModelWrapperBase):
model_type: str = "my_model"
def __init__(self, config_name, my_arg1, my_arg2, **kwargs):
diff --git a/docs/sphinx_doc/zh_CN/source/tutorial/206-prompt.md b/docs/sphinx_doc/zh_CN/source/tutorial/206-prompt.md
index 7888ee548..1bd05ad4e 100644
--- a/docs/sphinx_doc/zh_CN/source/tutorial/206-prompt.md
+++ b/docs/sphinx_doc/zh_CN/source/tutorial/206-prompt.md
@@ -23,13 +23,14 @@ AgentScope内置策略的目标是**使初学者能够顺利调用模型API ,
AgentScope为以下的模型API提供了内置的提示构建策略。
-- [`OpenAIChatWrapper`](#openaichatwrapper)
-- [`DashScopeChatWrapper`](#dashscopechatwrapper)
-- [`OllamaChatWrapper`](#ollamachatwrapper)
-- [`OllamaGenerationWrapper`](ollamagenerationwrapper)
-- [`GeminiChatWrapper`](#geminiwrapper)
+- [OpenAIChatWrapper](#openaichatwrapper)
+- [DashScopeChatWrapper](#dashscopechatwrapper)
+- [DashScopeMultiModalWrapper](#dashscopemultimodalwrapper)
+- [OllamaChatWrapper](#ollamachatwrapper)
+- [OllamaGenerationWrapper](#ollamagenerationwrapper)
+- [GeminiChatWrapper](#geminichatwrapper)
-这些策略是在对应Model Wrapper类的`format`函数中实现的。它接受`Msg`对象,`Msg`对象的列表或它们的混合作为输入。
+这些策略是在对应Model Wrapper类的`format`函数中实现的。它接受`Msg`对象,`Msg`对象的列表或它们的混合作为输入。不过,`format`函数会首先把输入重新组织成一个`Msg`对象的列表,因此为了方便解释,我们在下面的章节中认为`format`函数的输入是`Msg`对象的列表。
### `OpenAIChatWrapper`
@@ -115,6 +116,89 @@ print(prompt)
]
```
+### `DashScopeMultiModalWrapper`
+
+`DashScopeMultiModalWrapper`封装了DashScope多模态对话API,它接受消息列表作为输入,且消息必须遵循以下规则(更新于2024/04/04):
+
+- 每个消息是一个字典,并且包含`role`和`content`字段。
+ - 其中`role`字段取值必须是`"user"`,`"system"`,`"assistant"`之一。
+ - `content`字段对应的值必须是字典的列表
+ - 每个字典只包含`text`,`image`或`audio`中的一个键值对
+ - `text`域对应的值是一个字符串,表示文本内容
+ - `image`域对应的值是一个字符串,表示图片的url
+ - `audio`域对应的值是一个字符串,表示音频的url
+ - `content`中可以同时包含多个key为`image`的字典或者多个key为`audio`的字典。例如
+```python
+[
+ {
+ "role": "user",
+ "content": [
+ {"text": "What's the difference between these two pictures?"},
+ {"image": "https://xxx1.png"},
+ {"image": "https://xxx2.png"}
+ ]
+ },
+ {
+ "role": "assistant",
+ "content": [{"text": "The first picture is a cat, and the second picture is a dog."}]
+ },
+ {
+ "role": "user",
+ "content": [{"text": "I see, thanks!"}]
+ }
+]
+```
+- 如果一条消息的`role`字段是`"system"`,那么这条消息必须且只能出现在消息列表的开头。
+- 消息列表中最后一条消息的`role`字段必须是`"user"`。
+- 消息列表中`user`和`assistant`必须交替发言。
+
+#### 提示的构建策略
+
+基于上述API的限制,构建策略如下:
+- 如果输入消息列表中第一条消息的`role`字段值为`"system"`,它将被转换为一条系统消息,其`role`字段为`"system"`,`content`字段为系统提示。如果输入`Msg`对象的`url`属性不为`None`,则根据其文件类型在`content`中增加一个键为`"image"`或`"audio"`的字典。
+- 其余消息将被合并为一条`role`字段为`"user"`的消息,其`content`字段为对话历史。其中所有`url`属性不为`None`的`Msg`对象,都会根据`url`指向的文件类型在`content`中增加一个键为`"image"`或`"audio"`的字典。
+
+样例如下:
+
+```python
+from agentscope.models import DashScopeMultiModalWrapper
+from agentscope.message import Msg
+
+model = DashScopeMultiModalWrapper(
+ config_name="", # 我们直接初始化model wrapper,因此不需要填入config_name
+ model_name="qwen-vl-plus",
+)
+
+prompt = model.format(
+ Msg("system", "You're a helpful assistant", role="system", url="url_to_png1"), # Msg对象
+ [ # Msg对象的列表
+ Msg(name="Bob", content="Hi!", role="assistant", url="url_to_png2"),
+ Msg(name="Alice", content="Nice to meet you!", role="assistant", url="url_to_png3"),
+ ],
+)
+print(prompt)
+```
+
+```bash
+[
+    {
+        "role": "system",
+        "content": [
+            {"image": "url_to_png1"},
+            {"text": "You're a helpful assistant"}
+        ]
+    },
+    {
+        "role": "user",
+        "content": [
+            {"image": "url_to_png2"},
+            {"image": "url_to_png3"},
+            {"text": "## Dialogue History\nBob: Hi!\nAlice: Nice to meet you!"}
+        ]
+    }
+]
+```
+
### `OllamaChatWrapper`
`OllamaChatWrapper`封装了Ollama聊天API,它接受消息列表作为输入。消息必须遵守以下规则(更新于2024/03/22):
diff --git a/src/agentscope/models/__init__.py b/src/agentscope/models/__init__.py
index 2984228d4..d7a98147e 100644
--- a/src/agentscope/models/__init__.py
+++ b/src/agentscope/models/__init__.py
@@ -26,6 +26,7 @@
DashScopeChatWrapper,
DashScopeImageSynthesisWrapper,
DashScopeTextEmbeddingWrapper,
+ DashScopeMultiModalWrapper,
)
from .ollama_model import (
OllamaChatWrapper,
@@ -55,6 +56,7 @@
"DashScopeChatWrapper",
"DashScopeImageSynthesisWrapper",
"DashScopeTextEmbeddingWrapper",
+ "DashScopeMultiModalWrapper",
"OllamaChatWrapper",
"OllamaEmbeddingWrapper",
"OllamaGenerationWrapper",
diff --git a/src/agentscope/models/dashscope_model.py b/src/agentscope/models/dashscope_model.py
index dc82b0831..c43429d0e 100644
--- a/src/agentscope/models/dashscope_model.py
+++ b/src/agentscope/models/dashscope_model.py
@@ -1,12 +1,13 @@
# -*- coding: utf-8 -*-
"""Model wrapper for DashScope models"""
+import os
from abc import ABC
from http import HTTPStatus
from typing import Any, Union, List, Sequence
from loguru import logger
from ..message import MessageBase
-from ..utils.tools import _convert_to_str
+from ..utils.tools import _convert_to_str, _guess_type_by_extension
try:
import dashscope
@@ -75,7 +76,9 @@ def format(
class DashScopeChatWrapper(DashScopeWrapperBase):
- """The model wrapper for DashScope's chat API."""
+ """The model wrapper for DashScope's chat API, refer to
+ https://help.aliyun.com/zh/dashscope/developer-reference/api-details
+ """
model_type: str = "dashscope_chat"
@@ -164,8 +167,6 @@ def __call__(
"and 'content' key for DashScope API.",
)
- # TODO: move is to prompt engineering
- messages = self._preprocess_role(messages)
# step3: forward to generate response
response = dashscope.Generation.call(
model=self.model_name,
@@ -262,8 +263,6 @@ def format(
`List[dict]`:
The formatted messages.
"""
- # TODO: This function only convert agentscope msgs into qwen
- # messages, the re-range is executed in _preprocess_role function.
# Parse all information into a list of messages
input_msgs = []
@@ -314,37 +313,11 @@ def format(
return messages
- def _preprocess_role(self, messages: list) -> list:
- """preprocess role rules for DashScope"""
- # The models in this list require that the roles of messages must
- # alternate between "user" and "assistant".
- message_length = len(messages)
- if message_length % 2 == 1:
- # If the length of the message list is odd, roles will
- # alternate, starting with "user"
- roles = [
- "user" if i % 2 == 0 else "assistant"
- for i in range(message_length)
- ]
- else:
- # If the length of the message list is even, the first role
- # will be "system", followed by alternating "user" and
- # "assistant"
- roles = ["system"] + [
- "user" if i % 2 == 1 else "assistant"
- for i in range(1, message_length)
- ]
-
- # Assign the roles list to the "role" key for each message in
- # the messages list
- for message, role in zip(messages, roles):
- message["role"] = role
-
- return messages
-
class DashScopeImageSynthesisWrapper(DashScopeWrapperBase):
- """The model wrapper for DashScope Image Synthesis API."""
+ """The model wrapper for DashScope Image Synthesis API, refer to
+ https://help.aliyun.com/zh/dashscope/developer-reference/quick-start-1
+ """
model_type: str = "dashscope_image_synthesis"
@@ -543,3 +516,335 @@ def __call__(
],
raw=response,
)
+
+
+class DashScopeMultiModalWrapper(DashScopeWrapperBase):
+ """The model wrapper for DashScope Multimodal API, refer to
+ https://help.aliyun.com/zh/dashscope/developer-reference/tongyi-qianwen-vl-api
+ """
+
+ model_type: str = "dashscope_multimodal"
+
+ def _register_default_metrics(self) -> None:
+ # Set monitor accordingly
+ # TODO: set quota to the following metrics
+ self.monitor.register(
+ self._metric("call_counter"),
+ metric_unit="times",
+ )
+ self.monitor.register(
+ self._metric("prompt_tokens"),
+ metric_unit="token",
+ )
+ self.monitor.register(
+ self._metric("completion_tokens"),
+ metric_unit="token",
+ )
+ self.monitor.register(
+ self._metric("total_tokens"),
+ metric_unit="token",
+ )
+
+ def __call__(
+ self,
+ messages: list,
+ **kwargs: Any,
+ ) -> ModelResponse:
+ """Model call for DashScope MultiModal API.
+
+ Args:
+ messages (`list`):
+ A list of messages to process.
+ **kwargs (`Any`):
+ The keyword arguments to DashScope MultiModal API,
+ e.g. `stream`. Please refer to
+ https://help.aliyun.com/zh/dashscope/developer-reference/tongyi-qianwen-vl-plus-api
+ for more detailed arguments.
+
+ Returns:
+ `ModelResponse`:
+ The response text in text field, and the raw response in
+ raw field.
+
+ Note:
+            If image links are involved, the messages should be of the
+ following form:
+
+ .. code-block:: python
+
+ messages = [
+ {
+ "role": "system",
+ "content": [
+ {"text": "You are a helpful assistant."},
+ ],
+ },
+ {
+ "role": "user",
+ "content": [
+ {"text": "What does this picture depict?"},
+ {"image": "http://example.com/image.jpg"},
+ ],
+ },
+ ]
+
+            Therefore, you should pass in a list of message dictionaries
+            whose `content` values match the structure above.
+            If only text is involved, a single `{"text": ...}` entry in
+            each `content` list suffices.
+
+ `parse_func`, `fault_handler` and `max_retries` are reserved
+ for `_response_parse_decorator` to parse and check the response
+ generated by model wrapper. Their usages are listed as follows:
+ - `parse_func` is a callable function used to parse and
+ check the response generated by the model, which takes the
+ response as input.
+ - `max_retries` is the maximum number of retries when the
+                  `parse_func` raises an exception.
+ - `fault_handler` is a callable function which is called
+ when the response generated by the model is invalid after
+ `max_retries` retries.
+ """
+ # step1: prepare keyword arguments
+ kwargs = {**self.generate_args, **kwargs}
+
+ # step2: forward to generate response
+ response = dashscope.MultiModalConversation.call(
+ model=self.model_name,
+ messages=messages,
+ **kwargs,
+ )
+
+ if response.status_code != HTTPStatus.OK:
+ error_msg = (
+ f" Request id: {response.request_id},"
+ f" Status code: {response.status_code},"
+ f" error code: {response.code},"
+ f" error message: {response.message}."
+ )
+ raise RuntimeError(error_msg)
+
+ # step3: record the model api invocation if needed
+ self._save_model_invocation(
+ arguments={
+ "model": self.model_name,
+ "messages": messages,
+ **kwargs,
+ },
+ response=response,
+ )
+
+ # step4: update monitor accordingly
+ input_tokens = response.usage.get("input_tokens", 0)
+ image_tokens = response.usage.get("image_tokens", 0)
+ audio_tokens = response.usage.get("audio_tokens", 0)
+ output_tokens = response.usage.get("output_tokens", 0)
+ self.update_monitor(
+ call_counter=1,
+ prompt_tokens=input_tokens,
+ completion_tokens=output_tokens,
+ total_tokens=input_tokens
+ + output_tokens
+ + image_tokens
+ + audio_tokens,
+ )
+
+ # step5: return response
+ return ModelResponse(
+ text=response.output["choices"][0]["message"]["content"][0][
+ "text"
+ ],
+ raw=response,
+ )
+
+ def format(
+ self,
+ *args: Union[MessageBase, Sequence[MessageBase]],
+ ) -> List:
+ """Format the messages for DashScope Multimodal API.
+
+ The multimodal API has the following requirements:
+ - The roles of messages must alternate between "user" and
+ "assistant".
+ - The message with the role "system" should be the first message
+ in the list.
+ - If the system message exists, then the second message must
+ have the role "user".
+ - The last message in the list should have the role "user".
+        - In each message, more than one image or audio file is allowed.
+
+ With the above requirements, we format the messages as follows:
+        - If the first message is a system message, then we will keep it as
+        the system prompt.
+        - We merge all remaining messages into a dialogue history prompt in
+        a single message with the role "user".
+        - When there are multiple images or audio files in the given
+        messages, we attach them to the user message in order. Note that
+        attaching multiple files this way may confuse the model; for
+        advanced use cases, developers are encouraged to implement their
+        own prompt engineering strategies.
+
+ The following is an example:
+
+ .. code-block:: python
+
+ prompt = model.format(
+ Msg(
+ "system",
+ "You're a helpful assistant",
+ role="system", url="figure1"
+ ),
+ Msg(
+ "Bob",
+ "How about this picture?",
+ role="assistant", url="figure2"
+ ),
+ Msg(
+ "user",
+ "It's wonderful! How about mine?",
+ role="user", image="figure3"
+ )
+ )
+
+ The prompt will be as follows:
+
+ .. code-block:: python
+
+ [
+ {
+ "role": "system",
+ "content": [
+ {"text": "You are a helpful assistant"},
+ {"image": "figure1"}
+ ]
+ },
+ {
+ "role": "user",
+ "content": [
+ {"image": "figure2"},
+ {"image": "figure3"},
+ {
+ "text": (
+ "## Dialogue History\n"
+ "Bob: How about this picture?\n"
+ "user: It's wonderful! How about mine?"
+ )
+ },
+ ]
+ }
+ ]
+
+ Note:
+            In the multimodal API, urls of local files must be prefixed with
+            "file://"; this prefix is added automatically by this format
+            function.
+
+ Args:
+ args (`Union[MessageBase, Sequence[MessageBase]]`):
+ The input arguments to be formatted, where each argument
+ should be a `Msg` object, or a list of `Msg` objects.
+ In distribution, placeholder is also allowed.
+
+ Returns:
+ `List[dict]`:
+ The formatted messages.
+ """
+
+ # Parse all information into a list of messages
+ input_msgs = []
+ for _ in args:
+ if isinstance(_, MessageBase):
+ input_msgs.append(_)
+ elif isinstance(_, list) and all(
+ isinstance(__, MessageBase) for __ in _
+ ):
+ input_msgs.extend(_)
+ else:
+ raise TypeError(
+ f"The input should be a Msg object or a list "
+ f"of Msg objects, got {type(_)}.",
+ )
+
+ messages = []
+
+ # record dialog history as a list of strings
+ dialogue = []
+ image_or_audio_dicts = []
+ for i, unit in enumerate(input_msgs):
+ if i == 0 and unit.role == "system":
+ # system prompt
+ content = self._convert_url(unit.url)
+ content.append({"text": _convert_to_str(unit.content)})
+
+ messages.append(
+ {
+ "role": unit.role,
+ "content": content,
+ },
+ )
+ else:
+ # text message
+ dialogue.append(
+ f"{unit.name}: {_convert_to_str(unit.content)}",
+ )
+ # image and audio
+ image_or_audio_dicts.extend(self._convert_url(unit.url))
+
+ dialogue_history = "\n".join(dialogue)
+
+ user_content_template = "## Dialogue History\n{dialogue_history}"
+
+ messages.append(
+ {
+ "role": "user",
+ "content": [
+ # Place the image or audio before the dialogue history
+ *image_or_audio_dicts,
+ {
+ "text": user_content_template.format(
+ dialogue_history=dialogue_history,
+ ),
+ },
+ ],
+ },
+ )
+
+ return messages
+
+ def _convert_url(self, url: Union[str, Sequence[str], None]) -> List[dict]:
+ """Convert the url to the format of DashScope API. Note for local
+ files, a prefix "file://" will be added.
+
+ Args:
+ url (`Union[str, Sequence[str], None]`):
+                A string url or a list of urls to be converted.
+
+ Returns:
+ `List[dict]`:
+                A list of dictionaries, each keyed by the url type with the
+                url as value. Only "image" and "audio" are supported.
+ """
+ if url is None:
+ return []
+
+ if isinstance(url, str):
+ url_type = _guess_type_by_extension(url)
+ if url_type in ["audio", "image"]:
+ # Add prefix for local files
+ if os.path.exists(url):
+ url = "file://" + url
+ return [{url_type: url}]
+ else:
+ # skip unsupported url
+ logger.warning(
+ f"Skip unsupported url ({url_type}), "
+ f"expect image or audio.",
+ )
+ return []
+ elif isinstance(url, list):
+ dicts = []
+ for _ in url:
+ dicts.extend(self._convert_url(_))
+ return dicts
+ else:
+ raise TypeError(
+ f"Unsupported url type {type(url)}, " f"str or list expected.",
+ )
diff --git a/src/agentscope/utils/tools.py b/src/agentscope/utils/tools.py
index 9a00d8201..47c18e5d3 100644
--- a/src/agentscope/utils/tools.py
+++ b/src/agentscope/utils/tools.py
@@ -5,7 +5,7 @@
import json
import secrets
import string
-from typing import Any
+from typing import Any, Literal
from urllib.parse import urlparse
@@ -60,6 +60,64 @@ def to_dialog_str(item: dict) -> str:
return f"{speaker}: {content}"
+def _guess_type_by_extension(
+ url: str,
+) -> Literal["image", "audio", "video", "file"]:
+ """Guess the type of the file by its extension."""
+ extension = url.split(".")[-1].lower()
+
+ if extension in [
+ "bmp",
+ "dib",
+ "icns",
+ "ico",
+ "jfif",
+ "jpe",
+ "jpeg",
+ "jpg",
+ "j2c",
+ "j2k",
+ "jp2",
+ "jpc",
+ "jpf",
+ "jpx",
+ "apng",
+ "png",
+ "bw",
+ "rgb",
+ "rgba",
+ "sgi",
+ "tif",
+ "tiff",
+ "webp",
+ ]:
+ return "image"
+ elif extension in [
+ "amr",
+ "wav",
+ "3gp",
+ "3gpp",
+ "aac",
+ "mp3",
+ "flac",
+ "ogg",
+ ]:
+ return "audio"
+ elif extension in [
+ "mp4",
+ "webm",
+ "mkv",
+ "flv",
+ "avi",
+ "mov",
+ "wmv",
+ "rmvb",
+ ]:
+ return "video"
+ else:
+ return "file"
+
+
def _to_openai_image_url(url: str) -> str:
"""Convert an image url to openai format. If the given url is a local
file, it will be converted to base64 format. Otherwise, it will be
diff --git a/tests/dashscope_test.py b/tests/dashscope_test.py
index 8164e21c8..0a6bec7d3 100644
--- a/tests/dashscope_test.py
+++ b/tests/dashscope_test.py
@@ -3,11 +3,12 @@
import unittest
from unittest.mock import patch, MagicMock
-from agentscope.models import ( # pylint: disable=no-name-in-module
+from agentscope.models import (
ModelResponse,
DashScopeChatWrapper,
DashScopeImageSynthesisWrapper,
DashScopeTextEmbeddingWrapper,
+ DashScopeMultiModalWrapper,
)
@@ -252,5 +253,98 @@ def test_call_failure(self, mock_call: MagicMock) -> None:
)
+class TestDashScopeMultiModalWrapper(unittest.TestCase):
+ """Test DashScope MultiModal Wrapper"""
+
+ def setUp(self) -> None:
+ # Initialize DashScopeMultiModalWrapper instance
+ self.wrapper = DashScopeMultiModalWrapper(
+ config_name="test_config",
+ model_name="test_model",
+ api_key="test_key",
+ )
+
+ @patch(
+ "agentscope.models.dashscope_model."
+ "dashscope.MultiModalConversation.call",
+ )
+ def test_call_success(self, mock_call: MagicMock) -> None:
+ """Test call success"""
+ # Mocking the response from the API
+ mock_response = MagicMock()
+ mock_response.status_code = 200
+ mock_response.output = {
+ "choices": [
+ {"message": {"content": [{"text": "This is the result."}]}},
+ ],
+ }
+ mock_response.usage = {
+ "input_tokens": 23,
+ "output_tokens": 5,
+ "image_tokens": 17,
+ }
+ mock_call.return_value = mock_response
+
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {"text": "What does this picture depict?"},
+ {"image": "http://example.com/image.jpg"},
+ ],
+ },
+ ]
+ # Calling the wrapper and validating the response
+ response = self.wrapper(messages=messages)
+ self.assertIsInstance(response, ModelResponse)
+ self.assertEqual(response.text, "This is the result.")
+ self.assertEqual(response.raw, mock_response)
+
+ # Verify call to dashscope.MultiModalConversation.call
+ mock_call.assert_called_once_with(
+ model=self.wrapper.model_name,
+ messages=messages,
+ )
+
+ @patch(
+ "agentscope.models.dashscope_model."
+ "dashscope.MultiModalConversation.call",
+ )
+ def test_call_failure(self, mock_call: MagicMock) -> None:
+ """Test call failure"""
+ # Simulating a failed API call
+ mock_response = MagicMock()
+ mock_response.status_code = 400
+ mock_response.request_id = "Test_request_id"
+ mock_response.code = "Error Code"
+ mock_response.message = "Error Message"
+ mock_call.return_value = mock_response
+
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {"text": "What does this picture depict?"},
+ {"image": "http://example.com/image.jpg"},
+ ],
+ },
+ ]
+ # Expecting a RuntimeError to be raised
+ with self.assertRaises(RuntimeError) as context:
+ self.wrapper(messages=messages)
+
+ # Assert the expected exception message
+ self.assertIn("Error Code", str(context.exception))
+ self.assertIn("Error Message", str(context.exception))
+ self.assertIn("Test_request_id", str(context.exception))
+ self.assertIn("400", str(context.exception))
+
+ # Verify call to dashscope.MultiModalConversation.call
+ mock_call.assert_called_once_with(
+ model=self.wrapper.model_name,
+ messages=messages,
+ )
+
+
if __name__ == "__main__":
unittest.main()
diff --git a/tests/format_test.py b/tests/format_test.py
index 452c1cdd2..589398837 100644
--- a/tests/format_test.py
+++ b/tests/format_test.py
@@ -10,6 +10,7 @@
OllamaGenerationWrapper,
GeminiChatWrapper,
DashScopeChatWrapper,
+ DashScopeMultiModalWrapper,
)
@@ -172,6 +173,132 @@ def test_dashscope_chat(self) -> None:
with self.assertRaises(TypeError):
model.format(*self.wrong_inputs) # type: ignore[arg-type]
+ def test_dashscope_multimodal_image(self) -> None:
+ """Unit test for the format function in dashscope multimodal
+ conversation api wrapper for image."""
+ model = DashScopeMultiModalWrapper(
+ config_name="",
+ model_name="qwen-vl-plus",
+ api_key="xxx",
+ )
+
+ multimodal_input = [
+ Msg(
+ "system",
+ "You are a helpful assistant",
+ role="system",
+ url="url1.png",
+ ),
+ [
+ Msg(
+ "user",
+ "What is the weather today?",
+ role="user",
+ url="url2.png",
+ ),
+ Msg(
+ "assistant",
+ "It is sunny today",
+ role="assistant",
+ url="url3.png",
+ ),
+ ],
+ ]
+
+ ground_truth = [
+ {
+ "role": "system",
+ "content": [
+ {"image": "url1.png"},
+ {"text": "You are a helpful assistant"},
+ ],
+ },
+ {
+ "role": "user",
+ "content": [
+ {"image": "url2.png"},
+ {"image": "url3.png"},
+ {
+ "text": (
+ "## Dialogue History\n"
+ "user: What is the weather today?\n"
+ "assistant: It is sunny today"
+ ),
+ },
+ ],
+ },
+ ]
+
+ prompt = model.format(*multimodal_input)
+ self.assertListEqual(prompt, ground_truth)
+
+ # wrong format
+ with self.assertRaises(TypeError):
+ model.format(*self.wrong_inputs)
+
+ def test_dashscope_multimodal_audio(self) -> None:
+ """Unit test for the format function in dashscope multimodal
+ conversation api wrapper for audio."""
+ model = DashScopeMultiModalWrapper(
+ config_name="",
+ model_name="qwen-audio-turbo",
+ api_key="xxx",
+ )
+
+ multimodal_input = [
+ Msg(
+ "system",
+ "You are a helpful assistant",
+ role="system",
+ url="url1.mp3",
+ ),
+ [
+ Msg(
+ "user",
+ "What is the weather today?",
+ role="user",
+ url="url2.mp3",
+ ),
+ Msg(
+ "assistant",
+ "It is sunny today",
+ role="assistant",
+ url="url3.mp3",
+ ),
+ ],
+ ]
+
+ ground_truth = [
+ {
+ "role": "system",
+ "content": [
+ {"audio": "url1.mp3"},
+ {"text": "You are a helpful assistant"},
+ ],
+ },
+ {
+ "role": "user",
+ "content": [
+ {"audio": "url2.mp3"},
+ {"audio": "url3.mp3"},
+ {
+ "text": (
+ "## Dialogue History\n"
+ "user: What is the weather today?\n"
+ "assistant: It is sunny today"
+ ),
+ },
+ ],
+ },
+ ]
+
+ prompt = model.format(*multimodal_input)
+ self.assertListEqual(prompt, ground_truth)
+
+ # wrong format
+ with self.assertRaises(TypeError):
+ model.format(*self.wrong_inputs)
+
if __name__ == "__main__":
unittest.main()