diff --git a/README.md b/README.md
index 1358856ae..5612a8034 100644
--- a/README.md
+++ b/README.md
@@ -65,20 +65,21 @@ applications in a centralized programming manner for streamlined development.
AgentScope provides a list of `ModelWrapper` to support both local model
services and third-party model APIs.
-| API | Task | Model Wrapper | Example Configuration | Some Supported Models |
-|------------------------|-----------------|---------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------|-------------------------------------------|
-| OpenAI API | Chat | [`OpenAIChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#openai-api) | gpt-4, gpt-3.5-turbo, ... |
-| | Embedding | [`OpenAIEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#openai-api) | text-embedding-ada-002, ... |
-| | DALL·E | [`OpenAIDALLEWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#openai-api) | dall-e-2, dall-e-3 |
-| DashScope API | Chat | [`DashScopeChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#dashscope-api) | qwen-plus, qwen-max, ... |
-| | Image Synthesis | [`DashScopeImageSynthesisWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#dashscope-api) | wanx-v1 |
-| | Text Embedding | [`DashScopeTextEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#dashscope-api) | text-embedding-v1, text-embedding-v2, ... |
-| Gemini API | Chat | [`GeminiChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/gemini_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#gemini-api) | gemini-pro, ... |
-| | Embedding | [`GeminiEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/gemini_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#gemini-api) | models/embedding-001, ... |
-| ollama | Chat | [`OllamaChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#ollama-api) | llama2, Mistral, ... |
-| | Embedding | [`OllamaEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#ollama-api) | llama2, Mistral, ... |
-| | Generation | [`OllamaGenerationWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#ollama-api) | llama2, Mistral, ... |
-| Post Request based API | - | [`PostAPIModelWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/post_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#post-request-api) | - |
+| API | Task | Model Wrapper | Example Configuration | Some Supported Models |
+|------------------------|-----------------|---------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------|----------------------------------------------|
+| OpenAI API | Chat | [`OpenAIChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#openai-api) | gpt-4, gpt-3.5-turbo, ... |
+| | Embedding | [`OpenAIEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#openai-api) | text-embedding-ada-002, ... |
+| | DALL·E | [`OpenAIDALLEWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#openai-api) | dall-e-2, dall-e-3 |
+| DashScope API | Chat | [`DashScopeChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#dashscope-api) | qwen-plus, qwen-max, ... |
+| | Image Synthesis | [`DashScopeImageSynthesisWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#dashscope-api) | wanx-v1 |
+| | Text Embedding | [`DashScopeTextEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#dashscope-api) | text-embedding-v1, text-embedding-v2, ... |
+|                        | Multimodal      | [`DashScopeMultiModalWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py)      | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#dashscope-api)     | qwen-vl-plus, qwen-vl-max, qwen-audio-turbo, ... |
+| Gemini API | Chat | [`GeminiChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/gemini_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#gemini-api) | gemini-pro, ... |
+| | Embedding | [`GeminiEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/gemini_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#gemini-api) | models/embedding-001, ... |
+| ollama | Chat | [`OllamaChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#ollama-api) | llama2, Mistral, ... |
+| | Embedding | [`OllamaEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#ollama-api) | llama2, Mistral, ... |
+| | Generation | [`OllamaGenerationWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#ollama-api) | llama2, Mistral, ... |
+| Post Request based API | - | [`PostAPIModelWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/post_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#post-request-api) | - |
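+
+For example (a minimal sketch; the config name, model name, and API key below are placeholders), a wrapper can also be constructed directly in Python:
+
+```python
+from agentscope.models import DashScopeChatWrapper
+
+# Hypothetical values: replace with your own model name and API key
+model = DashScopeChatWrapper(
+    config_name="my_dashscope_chat",
+    model_name="qwen-max",
+    api_key="{your_api_key}",
+)
+```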
**Supported Local Model Deployment**
diff --git a/README_ZH.md b/README_ZH.md
index 745450e22..979dc246d 100644
--- a/README_ZH.md
+++ b/README_ZH.md
@@ -54,20 +54,21 @@ AgentScope是一个创新的多智能体开发平台,旨在赋予开发人员
AgentScope提供了一系列`ModelWrapper`来支持本地模型服务和第三方模型API。
-| API | Task | Model Wrapper | Example Configuration | Some Supported Models |
-|------------------------|-----------------|---------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------|-------------------------------------------|
-| OpenAI API | Chat | [`OpenAIChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#openai-api) | gpt-4, gpt-3.5-turbo, ... |
-| | Embedding | [`OpenAIEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#openai-api) | text-embedding-ada-002, ... |
-| | DALL·E | [`OpenAIDALLEWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#openai-api) | dall-e-2, dall-e-3 |
-| DashScope API | Chat | [`DashScopeChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#dashscope-api) | qwen-plus, qwen-max, ... |
-| | Image Synthesis | [`DashScopeImageSynthesisWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#dashscope-api) | wanx-v1 |
-| | Text Embedding | [`DashScopeTextEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#dashscope-api) | text-embedding-v1, text-embedding-v2, ... |
-| Gemini API | Chat | [`GeminiChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/gemini_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#gemini-api) | gemini-pro, ... |
-| | Embedding | [`GeminiEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/gemini_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#gemini-api) | models/embedding-001, ... |
-| ollama | Chat | [`OllamaChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#ollama-api) | llama2, Mistral, ... |
-| | Embedding | [`OllamaEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#ollama-api) | llama2, Mistral, ... |
-| | Generation | [`OllamaGenerationWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#ollama-api) | llama2, Mistral, ... |
-| Post Request based API | - | [`PostAPIModelWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/post_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#post-request-api) | - |
+| API | Task | Model Wrapper | Example Configuration | Some Supported Models |
+|------------------------|-----------------|---------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------|----------------------------------------------|
+| OpenAI API | Chat | [`OpenAIChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#openai-api) | gpt-4, gpt-3.5-turbo, ... |
+| | Embedding | [`OpenAIEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#openai-api) | text-embedding-ada-002, ... |
+| | DALL·E | [`OpenAIDALLEWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#openai-api) | dall-e-2, dall-e-3 |
+| DashScope API | Chat | [`DashScopeChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#dashscope-api) | qwen-plus, qwen-max, ... |
+| | Image Synthesis | [`DashScopeImageSynthesisWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#dashscope-api) | wanx-v1 |
+| | Text Embedding | [`DashScopeTextEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#dashscope-api) | text-embedding-v1, text-embedding-v2, ... |
+|                        | Multimodal      | [`DashScopeMultiModalWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py)      | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#dashscope-api)     | qwen-vl-plus, qwen-vl-max, qwen-audio-turbo, ... |
+| Gemini API | Chat | [`GeminiChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/gemini_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#gemini-api) | gemini-pro, ... |
+| | Embedding | [`GeminiEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/gemini_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#gemini-api) | models/embedding-001, ... |
+| ollama | Chat | [`OllamaChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#ollama-api) | llama2, Mistral, ... |
+| | Embedding | [`OllamaEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#ollama-api) | llama2, Mistral, ... |
+| | Generation | [`OllamaGenerationWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#ollama-api) | llama2, Mistral, ... |
+| Post Request based API | - | [`PostAPIModelWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/post_model.py) | [link](https://modelscope.github.io/agentscope/en/tutorial/203-model.html#post-request-api) | - |
**支持的本地模型部署**
diff --git a/docs/sphinx_doc/en/source/tutorial/203-model.md b/docs/sphinx_doc/en/source/tutorial/203-model.md
index 372ce5358..5c3cb0b7e 100644
--- a/docs/sphinx_doc/en/source/tutorial/203-model.md
+++ b/docs/sphinx_doc/en/source/tutorial/203-model.md
@@ -70,20 +70,21 @@ class OpenAIChatWrapper(OpenAIWrapperBase):
In the current AgentScope, the supported `model_type` types, the corresponding
`ModelWrapper` classes, and the supported APIs are as follows:
-| API | Task | Model Wrapper | `model_type` |
-|------------------------|-----------------|---------------------------------------------------------------------------------------------------------------------------------|-------------------------------|
-| OpenAI API | Chat | [`OpenAIChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | `"openai"` |
-| | Embedding | [`OpenAIEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | `"openai_embedding"` |
-| | DALL·E | [`OpenAIDALLEWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | `"openai_dall_e"` |
-| DashScope API | Chat | [`DashScopeChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | `"dashscope_chat"` |
-| | Image Synthesis | [`DashScopeImageSynthesisWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | `"dashscope_image_synthesis"` |
-| | Text Embedding | [`DashScopeTextEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | `"dashscope_text_embedding"` |
-| Gemini API | Chat | [`GeminiChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/gemini_model.py) | `"gemini_chat"` |
-| | Embedding | [`GeminiEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/gemini_model.py) | `"gemini_embedding"` |
-| ollama | Chat | [`OllamaChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | `"ollama_chat"` |
-| | Embedding | [`OllamaEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | `"ollama_embedding"` |
-| | Generation | [`OllamaGenerationWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | `"ollama_generate"` |
-| Post Request based API | - | [`PostAPIModelWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/post_model.py) | `"post_api"` |
+| API | Task | Model Wrapper | `model_type` | Some Supported Models |
+|------------------------|-----------------|---------------------------------------------------------------------------------------------------------------------------------|-------------------------------|--------------------------------------------------|
+| OpenAI API | Chat | [`OpenAIChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | `"openai"` | gpt-4, gpt-3.5-turbo, ... |
+| | Embedding | [`OpenAIEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | `"openai_embedding"` | text-embedding-ada-002, ... |
+| | DALL·E | [`OpenAIDALLEWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | `"openai_dall_e"` | dall-e-2, dall-e-3 |
+| DashScope API | Chat | [`DashScopeChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | `"dashscope_chat"` | qwen-plus, qwen-max, ... |
+| | Image Synthesis | [`DashScopeImageSynthesisWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | `"dashscope_image_synthesis"` | wanx-v1 |
+| | Text Embedding | [`DashScopeTextEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | `"dashscope_text_embedding"` | text-embedding-v1, text-embedding-v2, ... |
+| | Multimodal | [`DashScopeMultiModalWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | `"dashscope_multimodal"` | qwen-vl-plus, qwen-vl-max, qwen-audio-turbo, ... |
+| Gemini API | Chat | [`GeminiChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/gemini_model.py) | `"gemini_chat"` | gemini-pro, ... |
+| | Embedding | [`GeminiEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/gemini_model.py) | `"gemini_embedding"` | models/embedding-001, ... |
+| ollama | Chat | [`OllamaChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | `"ollama_chat"` | llama2, ... |
+| | Embedding | [`OllamaEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | `"ollama_embedding"` | llama2, ... |
+| | Generation | [`OllamaGenerationWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | `"ollama_generate"` | llama2, ... |
+| Post Request based API | - | [`PostAPIModelWrapperBase`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/post_model.py) | `"post_api"` | - |
#### Detailed Parameters
@@ -238,6 +239,27 @@ openai_chat_config = {
+
+DashScope Multimodal Conversation API (`agentscope.models.DashScopeMultiModalWrapper`)
+
+```python
+{
+ "config_name": "my_dashscope_multimodal_config",
+ "model_type": "dashscope_multimodal",
+
+ # Required parameters
+ "model_name": "{model_name}", # The model name in DashScope Multimodal Conversation API, e.g. qwen-vl-plus
+
+ # Optional parameters
+ "api_key": "{your_api_key}",
+ "generate_args": {
+ # ...
+ },
+}
+```
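+
+Once registered, the wrapper can be loaded by its config name. Below is a minimal sketch (the API key and image url are placeholders):
+
+```python
+import agentscope
+from agentscope.models import load_model_by_config_name
+
+# Register the configuration above; the api_key is a placeholder
+agentscope.init(
+    model_configs=[
+        {
+            "config_name": "my_dashscope_multimodal_config",
+            "model_type": "dashscope_multimodal",
+            "model_name": "qwen-vl-plus",
+            "api_key": "{your_api_key}",
+        },
+    ],
+)
+
+# Load the model wrapper by its config name
+model = load_model_by_config_name("my_dashscope_multimodal_config")
+
+# Call the wrapper with messages in the multimodal format
+response = model(
+    messages=[
+        {
+            "role": "user",
+            "content": [
+                {"text": "What does this picture depict?"},
+                {"image": "http://example.com/image.jpg"},
+            ],
+        },
+    ],
+)
+print(response.text)
+```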
+
+
+
#### Gemini API
diff --git a/docs/sphinx_doc/en/source/tutorial/206-prompt.md b/docs/sphinx_doc/en/source/tutorial/206-prompt.md
index 77ee006c7..43a6e18fc 100644
--- a/docs/sphinx_doc/en/source/tutorial/206-prompt.md
+++ b/docs/sphinx_doc/en/source/tutorial/206-prompt.md
@@ -38,17 +38,21 @@ following built-in strategies for most chat and generation related model APIs.
In AgentScope, we provide built-in strategies for the following chat and
generation model APIs.
-- [`OpenAIChatWrapper`](#openaichatwrapper)
-- [`DashScopeChatWrapper`](#dashscopechatwrapper)
-- [`OllamaChatWrapper`](#ollamachatwrapper)
-- [`OllamaGenerationWrapper`](ollamagenerationwrapper)
-- [`GeminiChatWrapper`](#geminiwrapper)
+- [OpenAIChatWrapper](#openaichatwrapper)
+- [DashScopeChatWrapper](#dashscopechatwrapper)
+- [DashScopeMultiModalWrapper](#dashscopemultimodalwrapper)
+- [OllamaChatWrapper](#ollamachatwrapper)
+- [OllamaGenerationWrapper](#ollamagenerationwrapper)
+- [GeminiChatWrapper](#geminichatwrapper)
These strategies are implemented in the `format` functions of the model
wrapper classes.
It accepts `Msg` objects, a list of `Msg` objects, or their mixture as input.
+However, the `format` function will first reorganize them into a flat list
+of `Msg` objects, so for simplicity the following sections treat the input
+as a list of `Msg` objects.
-### `OpenAIChatWrapper`
+### OpenAIChatWrapper
`OpenAIChatWrapper` encapsulates the OpenAI chat API, it takes a list of
dictionaries as input, where the dictionary must obey the following rules
@@ -95,7 +99,7 @@ print(prompt)
]
```
-### `DashScopeChatWrapper`
+### DashScopeChatWrapper
`DashScopeChatWrapper` encapsulates the DashScope chat API, which takes a list of messages as input. The message must obey the following rules (updated in 2024/03/22):
@@ -138,7 +142,91 @@ print(prompt)
]
```
-### `OllamaChatWrapper`
+### DashScopeMultiModalWrapper
+
+`DashScopeMultiModalWrapper` encapsulates the DashScope multimodal conversation API, which takes a list of messages as input. The message must obey the following rules (updated in 2024/04/04):
+
+- Each message is a dictionary with `role` and `content` fields.
+ - The `role` field must be either `"user"`, `"system"`, or `"assistant"`.
+ - The `content` field must be a list of dictionaries, where
+    - Each dictionary contains only one key-value pair, whose key must be `text`, `image` or `audio`.
+    - The `text` field is a string, representing the text content.
+    - The `image` field is a string, representing the image url.
+    - The `audio` field is a string, representing the audio url.
+ - The `content` field can contain multiple dictionaries with the key `image` or multiple dictionaries with the key `audio` at the same time. For example:
+```python
+[
+ {
+ "role": "user",
+ "content": [
+ {"text": "What's the difference between these two pictures?"},
+ {"image": "https://xxx1.png"},
+ {"image": "https://xxx2.png"}
+ ]
+ },
+ {
+ "role": "assistant",
+ "content": [{"text": "The first picture is a cat, and the second picture is a dog."}]
+ },
+ {
+ "role": "user",
+ "content": [{"text": "I see, thanks!"}]
+ }
+]
+```
+- A message whose `role` field is `"system"` must be the first message in the list, and at most one such message is allowed.
+- The last message must have the `role` field as `"user"`.
+- The `user` and `assistant` messages must alternate.
+
+#### Prompt Strategy
+
+Based on the above rules, the `format` function in `DashScopeMultiModalWrapper` will parse the input messages as follows:
+
+- If the first message in the input message list has `"system"` in its `role` field, it will be converted into a system message whose `role` field is `"system"` and whose `content` field holds the system prompt. If the `url` field of the input `Msg` object is not `None`, a dictionary with the key `"image"` or `"audio"` will be added to the `content` according to its file type.
+- The rest of the messages will be merged into a single message whose `role` field is `"user"` and whose `content` field holds the dialogue history. For each of these messages, if the `url` field is not `None`, a dictionary with the key `"image"` or `"audio"` will be added to the `content` according to the file type that the `url` points to.
+
+An example:
+
+```python
+from agentscope.models import DashScopeMultiModalWrapper
+from agentscope.message import Msg
+
+model = DashScopeMultiModalWrapper(
+ config_name="", # empty since we directly initialize the model wrapper
+ model_name="qwen-vl-plus",
+)
+
+prompt = model.format(
+ Msg("system", "You're a helpful assistant", role="system", url="url_to_png1"), # Msg object
+ [ # a list of Msg objects
+ Msg(name="Bob", content="Hi!", role="assistant", url="url_to_png2"),
+ Msg(name="Alice", content="Nice to meet you!", role="assistant", url="url_to_png3"),
+ ],
+)
+print(prompt)
+```
+
+```bash
+[
+    {
+        "role": "system",
+        "content": [
+            {"image": "url_to_png1"},
+            {"text": "You're a helpful assistant"}
+        ]
+    },
+    {
+        "role": "user",
+        "content": [
+            {"image": "url_to_png2"},
+            {"image": "url_to_png3"},
+            {"text": "## Dialogue History\nBob: Hi!\nAlice: Nice to meet you!"}
+        ]
+    }
+]
+```
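+
+Note that if the `url` field of a `Msg` object points to a local file, the `format` function will automatically prefix it with `file://`, as required by the DashScope multimodal API.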
+
+### OllamaChatWrapper
`OllamaChatWrapper` encapsulates the Ollama chat API, which takes a list of
messages as input. The message must obey the following rules (updated in
@@ -183,7 +271,7 @@ print(prompt)
]
```
-### `OllamaGenerationWrapper`
+### OllamaGenerationWrapper
`OllamaGenerationWrapper` encapsulates the Ollama generation API, which
takes a string prompt as input without any constraints (updated to 2024/03/22).
diff --git a/docs/sphinx_doc/zh_CN/source/tutorial/203-model.md b/docs/sphinx_doc/zh_CN/source/tutorial/203-model.md
index fc30bc076..99c4dec93 100644
--- a/docs/sphinx_doc/zh_CN/source/tutorial/203-model.md
+++ b/docs/sphinx_doc/zh_CN/source/tutorial/203-model.md
@@ -90,20 +90,21 @@ class OpenAIChatWrapper(OpenAIWrapper):
在目前的AgentScope中,所支持的`model_type`类型,对应的`ModelWrapper`类,以及支持的
API如下:
-| API | Task | Model Wrapper | `model_type` |
-|------------------------|-----------------|---------------------------------------------------------------------------------------------------------------------------------|-------------------------------|
-| OpenAI API | Chat | [`OpenAIChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | `"openai"` |
-| | Embedding | [`OpenAIEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | `"openai_embedding"` |
-| | DALL·E | [`OpenAIDALLEWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | `"openai_dall_e"` |
-| DashScope API | Chat | [`DashScopeChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | `"dashscope_chat"` |
-| | Image Synthesis | [`DashScopeImageSynthesisWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | `"dashscope_image_synthesis"` |
-| | Text Embedding | [`DashScopeTextEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | `"dashscope_text_embedding"` |
-| Gemini API | Chat | [`GeminiChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/gemini_model.py) | `"gemini_chat"` |
-| | Embedding | [`GeminiEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/gemini_model.py) | `"gemini_embedding"` |
-| ollama | Chat | [`OllamaChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | `"ollama_chat"` |
-| | Embedding | [`OllamaEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | `"ollama_embedding"` |
-| | Generation | [`OllamaGenerationWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | `"ollama_generate"` |
-| Post Request based API | - | [`PostAPIModelWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/post_model.py) | `"post_api"` |
+| API | Task | Model Wrapper | `model_type` | Some Supported Models |
+|------------------------|-----------------|---------------------------------------------------------------------------------------------------------------------------------|-------------------------------|--------------------------------------------------|
+| OpenAI API | Chat | [`OpenAIChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | `"openai"` | gpt-4, gpt-3.5-turbo, ... |
+| | Embedding | [`OpenAIEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | `"openai_embedding"` | text-embedding-ada-002, ... |
+| | DALL·E | [`OpenAIDALLEWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/openai_model.py) | `"openai_dall_e"` | dall-e-2, dall-e-3 |
+| DashScope API | Chat | [`DashScopeChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | `"dashscope_chat"` | qwen-plus, qwen-max, ... |
+| | Image Synthesis | [`DashScopeImageSynthesisWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | `"dashscope_image_synthesis"` | wanx-v1 |
+| | Text Embedding | [`DashScopeTextEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | `"dashscope_text_embedding"` | text-embedding-v1, text-embedding-v2, ... |
+| | Multimodal | [`DashScopeMultiModalWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/dashscope_model.py) | `"dashscope_multimodal"` | qwen-vl-plus, qwen-vl-max, qwen-audio-turbo, ... |
+| Gemini API | Chat | [`GeminiChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/gemini_model.py) | `"gemini_chat"` | gemini-pro, ... |
+| | Embedding | [`GeminiEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/gemini_model.py) | `"gemini_embedding"` | models/embedding-001, ... |
+| ollama | Chat | [`OllamaChatWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | `"ollama_chat"` | llama2, ... |
+| | Embedding | [`OllamaEmbeddingWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | `"ollama_embedding"` | llama2, ... |
+| | Generation | [`OllamaGenerationWrapper`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/ollama_model.py) | `"ollama_generate"` | llama2, ... |
+| Post Request based API | - | [`PostAPIModelWrapperBase`](https://github.com/modelscope/agentscope/blob/main/src/agentscope/models/post_model.py) | `"post_api"` | - |
#### 详细参数
@@ -258,6 +259,27 @@ openai_chat_config = {
+
+DashScope Multimodal Conversation API (`agentscope.models.DashScopeMultiModalWrapper`)
+
+```python
+{
+ "config_name": "my_dashscope_multimodal_config",
+ "model_type": "dashscope_multimodal",
+
+ # Required parameters
+ "model_name": "{model_name}", # The model name in DashScope Multimodal Conversation API, e.g. qwen-vl-plus
+
+ # Optional parameters
+ "api_key": "{your_api_key}",
+ "generate_args": {
+ # ...
+ },
+}
+```
+
+
+
#### Gemini API
@@ -417,8 +439,8 @@ AgentScope允许开发者自定义自己的模型包装器。新的模型包装
```python
from agentscope.models import ModelWrapperBase
-class MyModelWrapper(ModelWrapperBase):
+class MyModelWrapper(ModelWrapperBase):
model_type: str = "my_model"
def __init__(self, config_name, my_arg1, my_arg2, **kwargs):
diff --git a/docs/sphinx_doc/zh_CN/source/tutorial/206-prompt.md b/docs/sphinx_doc/zh_CN/source/tutorial/206-prompt.md
index 7888ee548..1bd05ad4e 100644
--- a/docs/sphinx_doc/zh_CN/source/tutorial/206-prompt.md
+++ b/docs/sphinx_doc/zh_CN/source/tutorial/206-prompt.md
@@ -23,13 +23,14 @@ AgentScope内置策略的目标是**使初学者能够顺利调用模型API ,
AgentScope为以下的模型API提供了内置的提示构建策略。
-- [`OpenAIChatWrapper`](#openaichatwrapper)
-- [`DashScopeChatWrapper`](#dashscopechatwrapper)
-- [`OllamaChatWrapper`](#ollamachatwrapper)
-- [`OllamaGenerationWrapper`](ollamagenerationwrapper)
-- [`GeminiChatWrapper`](#geminiwrapper)
+- [OpenAIChatWrapper](#openaichatwrapper)
+- [DashScopeChatWrapper](#dashscopechatwrapper)
+- [DashScopeMultiModalWrapper](#dashscopemultimodalwrapper)
+- [OllamaChatWrapper](#ollamachatwrapper)
+- [OllamaGenerationWrapper](#ollamagenerationwrapper)
+- [GeminiChatWrapper](#geminichatwrapper)
-这些策略是在对应Model Wrapper类的`format`函数中实现的。它接受`Msg`对象,`Msg`对象的列表或它们的混合作为输入。
+这些策略是在对应Model Wrapper类的`format`函数中实现的。它接受`Msg`对象,`Msg`对象的列表或它们的混合作为输入。不过,`format`函数会首先把输入重新组织成一个`Msg`对象的列表,因此为了方便解释,我们在下面的章节中认为`format`函数的输入是`Msg`对象的列表。
### `OpenAIChatWrapper`
@@ -115,6 +116,89 @@ print(prompt)
]
```
+### `DashScopeMultiModalWrapper`
+
+`DashScopeMultiModalWrapper`封装了DashScope多模态对话API,它接受消息列表作为输入,且消息必须遵循以下规则(更新于2024/04/04):
+
+- 每个消息是一个字典,并且包含`role`和`content`字段。
+ - 其中`role`字段取值必须是`"user"`,`"system"`,`"assistant"`之一。
+ - `content`字段对应的值必须是字典的列表
+ - 每个字典只包含`text`,`image`或`audio`中的一个键值对
+ - `text`域对应的值是一个字符串,表示文本内容
+ - `image`域对应的值是一个字符串,表示图片的url
+ - `audio`域对应的值是一个字符串,表示音频的url
+ - `content`中可以同时包含多个key为`image`的字典或者多个key为`audio`的字典。例如
+```python
+[
+ {
+ "role": "user",
+ "content": [
+ {"text": "What's the difference between these two pictures?"},
+ {"image": "https://xxx1.png"},
+ {"image": "https://xxx2.png"}
+ ]
+ },
+ {
+ "role": "assistant",
+ "content": [{"text": "The first picture is a cat, and the second picture is a dog."}]
+ },
+ {
+ "role": "user",
+ "content": [{"text": "I see, thanks!"}]
+ }
+]
+```
+- 如果一条消息的`role`字段是`"system"`,那么这条消息必须且只能出现在消息列表的开头。
+- 消息列表中最后一条消息的`role`字段必须是`"user"`。
+- 消息列表中`user`和`assistant`必须交替发言。
+
+#### 提示的构建策略
+
+基于上述API的限制,构建策略如下:
+- 如果输入消息列表中第一条消息的`role`字段值为`"system"`,它将被转换为一条系统消息,其`role`字段为`"system"`,`content`字段为系统提示。如果输入`Msg`对象的`url`属性不为`None`,则根据其文件类型在`content`中增加一个键为`"image"`或`"audio"`的字典。
+- 其余消息将被合并为一条`role`字段为`"user"`的消息,其`content`字段为对话历史。其中所有`url`属性不为`None`的`Msg`对象,都会根据`url`指向的文件类型在`content`中增加一个键为`"image"`或`"audio"`的字典。
+
+样例如下:
+
+```python
+from agentscope.models import DashScopeMultiModalWrapper
+from agentscope.message import Msg
+
+model = DashScopeMultiModalWrapper(
+ config_name="", # 我们直接初始化model wrapper,因此不需要填入config_name
+ model_name="qwen-vl-plus",
+)
+
+prompt = model.format(
+ Msg("system", "You're a helpful assistant", role="system", url="url_to_png1"), # Msg对象
+ [ # Msg对象的列表
+ Msg(name="Bob", content="Hi!", role="assistant", url="url_to_png2"),
+ Msg(name="Alice", content="Nice to meet you!", role="assistant", url="url_to_png3"),
+ ],
+)
+print(prompt)
+```
+
+```bash
+[
+    {
+        "role": "system",
+        "content": [
+            {"image": "url_to_png1"},
+            {"text": "You're a helpful assistant"}
+        ]
+    },
+    {
+        "role": "user",
+        "content": [
+            {"image": "url_to_png2"},
+            {"image": "url_to_png3"},
+            {"text": "## Dialogue History\nBob: Hi!\nAlice: Nice to meet you!"}
+        ]
+    }
+]
+```
+
### `OllamaChatWrapper`
`OllamaChatWrapper`封装了Ollama聊天API,它接受消息列表作为输入。消息必须遵守以下规则(更新于2024/03/22):
diff --git a/src/agentscope/models/__init__.py b/src/agentscope/models/__init__.py
index 2984228d4..d7a98147e 100644
--- a/src/agentscope/models/__init__.py
+++ b/src/agentscope/models/__init__.py
@@ -26,6 +26,7 @@
DashScopeChatWrapper,
DashScopeImageSynthesisWrapper,
DashScopeTextEmbeddingWrapper,
+ DashScopeMultiModalWrapper,
)
from .ollama_model import (
OllamaChatWrapper,
@@ -55,6 +56,7 @@
"DashScopeChatWrapper",
"DashScopeImageSynthesisWrapper",
"DashScopeTextEmbeddingWrapper",
+ "DashScopeMultiModalWrapper",
"OllamaChatWrapper",
"OllamaEmbeddingWrapper",
"OllamaGenerationWrapper",
diff --git a/src/agentscope/models/dashscope_model.py b/src/agentscope/models/dashscope_model.py
index dc82b0831..c43429d0e 100644
--- a/src/agentscope/models/dashscope_model.py
+++ b/src/agentscope/models/dashscope_model.py
@@ -1,12 +1,13 @@
# -*- coding: utf-8 -*-
"""Model wrapper for DashScope models"""
+import os
from abc import ABC
from http import HTTPStatus
from typing import Any, Union, List, Sequence
from loguru import logger
from ..message import MessageBase
-from ..utils.tools import _convert_to_str
+from ..utils.tools import _convert_to_str, _guess_type_by_extension
try:
import dashscope
@@ -75,7 +76,9 @@ def format(
class DashScopeChatWrapper(DashScopeWrapperBase):
- """The model wrapper for DashScope's chat API."""
+ """The model wrapper for DashScope's chat API, refer to
+ https://help.aliyun.com/zh/dashscope/developer-reference/api-details
+ """
model_type: str = "dashscope_chat"
@@ -164,8 +167,6 @@ def __call__(
"and 'content' key for DashScope API.",
)
- # TODO: move is to prompt engineering
- messages = self._preprocess_role(messages)
# step3: forward to generate response
response = dashscope.Generation.call(
model=self.model_name,
@@ -262,8 +263,6 @@ def format(
`List[dict]`:
The formatted messages.
"""
- # TODO: This function only convert agentscope msgs into qwen
- # messages, the re-range is executed in _preprocess_role function.
# Parse all information into a list of messages
input_msgs = []
@@ -314,37 +313,11 @@ def format(
return messages
- def _preprocess_role(self, messages: list) -> list:
- """preprocess role rules for DashScope"""
- # The models in this list require that the roles of messages must
- # alternate between "user" and "assistant".
- message_length = len(messages)
- if message_length % 2 == 1:
- # If the length of the message list is odd, roles will
- # alternate, starting with "user"
- roles = [
- "user" if i % 2 == 0 else "assistant"
- for i in range(message_length)
- ]
- else:
- # If the length of the message list is even, the first role
- # will be "system", followed by alternating "user" and
- # "assistant"
- roles = ["system"] + [
- "user" if i % 2 == 1 else "assistant"
- for i in range(1, message_length)
- ]
-
- # Assign the roles list to the "role" key for each message in
- # the messages list
- for message, role in zip(messages, roles):
- message["role"] = role
-
- return messages
-
class DashScopeImageSynthesisWrapper(DashScopeWrapperBase):
- """The model wrapper for DashScope Image Synthesis API."""
+ """The model wrapper for DashScope Image Synthesis API, refer to
+ https://help.aliyun.com/zh/dashscope/developer-reference/quick-start-1
+ """
model_type: str = "dashscope_image_synthesis"
@@ -543,3 +516,335 @@ def __call__(
],
raw=response,
)
+
+
+class DashScopeMultiModalWrapper(DashScopeWrapperBase):
+ """The model wrapper for DashScope Multimodal API, refer to
+ https://help.aliyun.com/zh/dashscope/developer-reference/tongyi-qianwen-vl-api
+ """
+
+ model_type: str = "dashscope_multimodal"
+
+ def _register_default_metrics(self) -> None:
+ # Set monitor accordingly
+ # TODO: set quota to the following metrics
+ self.monitor.register(
+ self._metric("call_counter"),
+ metric_unit="times",
+ )
+ self.monitor.register(
+ self._metric("prompt_tokens"),
+ metric_unit="token",
+ )
+ self.monitor.register(
+ self._metric("completion_tokens"),
+ metric_unit="token",
+ )
+ self.monitor.register(
+ self._metric("total_tokens"),
+ metric_unit="token",
+ )
+
+ def __call__(
+ self,
+ messages: list,
+ **kwargs: Any,
+ ) -> ModelResponse:
+ """Model call for DashScope MultiModal API.
+
+ Args:
+ messages (`list`):
+ A list of messages to process.
+ **kwargs (`Any`):
+ The keyword arguments to DashScope MultiModal API,
+ e.g. `stream`. Please refer to
+ https://help.aliyun.com/zh/dashscope/developer-reference/tongyi-qianwen-vl-plus-api
+ for more detailed arguments.
+
+ Returns:
+ `ModelResponse`:
+ The response text in text field, and the raw response in
+ raw field.
+
+ Note:
+            If image links are involved, the messages should be of the
+ following form:
+
+ .. code-block:: python
+
+ messages = [
+ {
+ "role": "system",
+ "content": [
+ {"text": "You are a helpful assistant."},
+ ],
+ },
+ {
+ "role": "user",
+ "content": [
+ {"text": "What does this picture depict?"},
+ {"image": "http://example.com/image.jpg"},
+ ],
+ },
+ ]
+
+            Therefore, you should pass in a list of message dictionaries
+            whose `content` values match the structure above.
+            If only text is involved, a single `{"text": ...}` entry in
+            each `content` list suffices.
+
+ `parse_func`, `fault_handler` and `max_retries` are reserved
+ for `_response_parse_decorator` to parse and check the response
+ generated by model wrapper. Their usages are listed as follows:
+ - `parse_func` is a callable function used to parse and
+ check the response generated by the model, which takes the
+ response as input.
+ - `max_retries` is the maximum number of retries when the
+                  `parse_func` raises an exception.
+ - `fault_handler` is a callable function which is called
+ when the response generated by the model is invalid after
+ `max_retries` retries.
+ """
+ # step1: prepare keyword arguments
+ kwargs = {**self.generate_args, **kwargs}
+
+ # step2: forward to generate response
+ response = dashscope.MultiModalConversation.call(
+ model=self.model_name,
+ messages=messages,
+ **kwargs,
+ )
+
+ if response.status_code != HTTPStatus.OK:
+ error_msg = (
+ f" Request id: {response.request_id},"
+ f" Status code: {response.status_code},"
+ f" error code: {response.code},"
+ f" error message: {response.message}."
+ )
+ raise RuntimeError(error_msg)
+
+ # step3: record the model api invocation if needed
+ self._save_model_invocation(
+ arguments={
+ "model": self.model_name,
+ "messages": messages,
+ **kwargs,
+ },
+ response=response,
+ )
+
+ # step4: update monitor accordingly
+ input_tokens = response.usage.get("input_tokens", 0)
+ image_tokens = response.usage.get("image_tokens", 0)
+ audio_tokens = response.usage.get("audio_tokens", 0)
+ output_tokens = response.usage.get("output_tokens", 0)
+ self.update_monitor(
+ call_counter=1,
+ prompt_tokens=input_tokens,
+ completion_tokens=output_tokens,
+ total_tokens=input_tokens
+ + output_tokens
+ + image_tokens
+ + audio_tokens,
+ )
+
+ # step5: return response
+ return ModelResponse(
+ text=response.output["choices"][0]["message"]["content"][0][
+ "text"
+ ],
+ raw=response,
+ )
+
+ def format(
+ self,
+ *args: Union[MessageBase, Sequence[MessageBase]],
+ ) -> List:
+ """Format the messages for DashScope Multimodal API.
+
+ The multimodal API has the following requirements:
+ - The roles of messages must alternate between "user" and
+ "assistant".
+ - The message with the role "system" should be the first message
+ in the list.
+ - If the system message exists, then the second message must
+ have the role "user".
+ - The last message in the list should have the role "user".
+        - In each message, more than one image or audio file is allowed.
+
+ With the above requirements, we format the messages as follows:
+        - If the first message is a system message, then we will keep it as
+        the system prompt.
+        - We merge all remaining messages into a dialogue history prompt in
+        a single message with the role "user".
+        - When there are multiple images or audio files in the given
+        messages, we attach them to the user message in order. Note that
+        attaching multiple files this way may confuse the model; for
+        advanced use cases, developers are encouraged to implement their
+        own prompt engineering strategies.
+
+ The following is an example:
+
+ .. code-block:: python
+
+ prompt = model.format(
+ Msg(
+ "system",
+ "You're a helpful assistant",
+ role="system", url="figure1"
+ ),
+ Msg(
+ "Bob",
+ "How about this picture?",
+ role="assistant", url="figure2"
+ ),
+ Msg(
+ "user",
+ "It's wonderful! How about mine?",
+ role="user", image="figure3"
+ )
+ )
+
+ The prompt will be as follows:
+
+ .. code-block:: python
+
+ [
+ {
+ "role": "system",
+ "content": [
+ {"text": "You are a helpful assistant"},
+ {"image": "figure1"}
+ ]
+ },
+ {
+ "role": "user",
+ "content": [
+ {"image": "figure2"},
+ {"image": "figure3"},
+ {
+ "text": (
+ "## Dialogue History\n"
+ "Bob: How about this picture?\n"
+ "user: It's wonderful! How about mine?"
+ )
+ },
+ ]
+ }
+ ]
+
+ Note:
+            In the multimodal API, urls of local files must be prefixed with
+            "file://"; this prefix is added automatically by this format
+            function.
+
+ Args:
+ args (`Union[MessageBase, Sequence[MessageBase]]`):
+ The input arguments to be formatted, where each argument
+ should be a `Msg` object, or a list of `Msg` objects.
+ In distribution, placeholder is also allowed.
+
+ Returns:
+ `List[dict]`:
+ The formatted messages.
+ """
+
+ # Parse all information into a list of messages
+ input_msgs = []
+ for _ in args:
+ if isinstance(_, MessageBase):
+ input_msgs.append(_)
+ elif isinstance(_, list) and all(
+ isinstance(__, MessageBase) for __ in _
+ ):
+ input_msgs.extend(_)
+ else:
+ raise TypeError(
+ f"The input should be a Msg object or a list "
+ f"of Msg objects, got {type(_)}.",
+ )
+
+ messages = []
+
+ # record dialog history as a list of strings
+ dialogue = []
+ image_or_audio_dicts = []
+ for i, unit in enumerate(input_msgs):
+ if i == 0 and unit.role == "system":
+ # system prompt
+ content = self._convert_url(unit.url)
+ content.append({"text": _convert_to_str(unit.content)})
+
+ messages.append(
+ {
+ "role": unit.role,
+ "content": content,
+ },
+ )
+ else:
+ # text message
+ dialogue.append(
+ f"{unit.name}: {_convert_to_str(unit.content)}",
+ )
+ # image and audio
+ image_or_audio_dicts.extend(self._convert_url(unit.url))
+
+ dialogue_history = "\n".join(dialogue)
+
+ user_content_template = "## Dialogue History\n{dialogue_history}"
+
+ messages.append(
+ {
+ "role": "user",
+ "content": [
+ # Place the image or audio before the dialogue history
+ *image_or_audio_dicts,
+ {
+ "text": user_content_template.format(
+ dialogue_history=dialogue_history,
+ ),
+ },
+ ],
+ },
+ )
+
+ return messages
+
+ def _convert_url(self, url: Union[str, Sequence[str], None]) -> List[dict]:
+ """Convert the url to the format of DashScope API. Note for local
+ files, a prefix "file://" will be added.
+
+ Args:
+ url (`Union[str, Sequence[str], None]`):
+                A string url or a list of urls to be converted.
+
+ Returns:
+ `List[dict]`:
+                A list of dictionaries, each keyed by the url type with the
+                url as value. Only "image" and "audio" are supported.
+ """
+ if url is None:
+ return []
+
+ if isinstance(url, str):
+ url_type = _guess_type_by_extension(url)
+ if url_type in ["audio", "image"]:
+ # Add prefix for local files
+ if os.path.exists(url):
+ url = "file://" + url
+ return [{url_type: url}]
+ else:
+ # skip unsupported url
+ logger.warning(
+ f"Skip unsupported url ({url_type}), "
+ f"expect image or audio.",
+ )
+ return []
+ elif isinstance(url, list):
+ dicts = []
+ for _ in url:
+ dicts.extend(self._convert_url(_))
+ return dicts
+ else:
+ raise TypeError(
+ f"Unsupported url type {type(url)}, " f"str or list expected.",
+ )
diff --git a/src/agentscope/utils/tools.py b/src/agentscope/utils/tools.py
index 9a00d8201..47c18e5d3 100644
--- a/src/agentscope/utils/tools.py
+++ b/src/agentscope/utils/tools.py
@@ -5,7 +5,7 @@
import json
import secrets
import string
-from typing import Any
+from typing import Any, Literal
from urllib.parse import urlparse
@@ -60,6 +60,64 @@ def to_dialog_str(item: dict) -> str:
return f"{speaker}: {content}"
+def _guess_type_by_extension(
+ url: str,
+) -> Literal["image", "audio", "video", "file"]:
+ """Guess the type of the file by its extension."""
+ extension = url.split(".")[-1].lower()
+
+ if extension in [
+ "bmp",
+ "dib",
+ "icns",
+ "ico",
+ "jfif",
+ "jpe",
+ "jpeg",
+ "jpg",
+ "j2c",
+ "j2k",
+ "jp2",
+ "jpc",
+ "jpf",
+ "jpx",
+ "apng",
+ "png",
+ "bw",
+ "rgb",
+ "rgba",
+ "sgi",
+ "tif",
+ "tiff",
+ "webp",
+ ]:
+ return "image"
+ elif extension in [
+ "amr",
+ "wav",
+ "3gp",
+ "3gpp",
+ "aac",
+ "mp3",
+ "flac",
+ "ogg",
+ ]:
+ return "audio"
+ elif extension in [
+ "mp4",
+ "webm",
+ "mkv",
+ "flv",
+ "avi",
+ "mov",
+ "wmv",
+ "rmvb",
+ ]:
+ return "video"
+ else:
+ return "file"
+
+
def _to_openai_image_url(url: str) -> str:
"""Convert an image url to openai format. If the given url is a local
file, it will be converted to base64 format. Otherwise, it will be
diff --git a/tests/dashscope_test.py b/tests/dashscope_test.py
index 8164e21c8..0a6bec7d3 100644
--- a/tests/dashscope_test.py
+++ b/tests/dashscope_test.py
@@ -3,11 +3,12 @@
import unittest
from unittest.mock import patch, MagicMock
-from agentscope.models import ( # pylint: disable=no-name-in-module
+from agentscope.models import (
ModelResponse,
DashScopeChatWrapper,
DashScopeImageSynthesisWrapper,
DashScopeTextEmbeddingWrapper,
+ DashScopeMultiModalWrapper,
)
@@ -252,5 +253,98 @@ def test_call_failure(self, mock_call: MagicMock) -> None:
)
+class TestDashScopeMultiModalWrapper(unittest.TestCase):
+ """Test DashScope MultiModal Wrapper"""
+
+ def setUp(self) -> None:
+ # Initialize DashScopeMultiModalWrapper instance
+ self.wrapper = DashScopeMultiModalWrapper(
+ config_name="test_config",
+ model_name="test_model",
+ api_key="test_key",
+ )
+
+ @patch(
+ "agentscope.models.dashscope_model."
+ "dashscope.MultiModalConversation.call",
+ )
+ def test_call_success(self, mock_call: MagicMock) -> None:
+ """Test call success"""
+ # Mocking the response from the API
+ mock_response = MagicMock()
+ mock_response.status_code = 200
+ mock_response.output = {
+ "choices": [
+ {"message": {"content": [{"text": "This is the result."}]}},
+ ],
+ }
+ mock_response.usage = {
+ "input_tokens": 23,
+ "output_tokens": 5,
+ "image_tokens": 17,
+ }
+ mock_call.return_value = mock_response
+
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {"text": "What does this picture depict?"},
+ {"image": "http://example.com/image.jpg"},
+ ],
+ },
+ ]
+ # Calling the wrapper and validating the response
+ response = self.wrapper(messages=messages)
+ self.assertIsInstance(response, ModelResponse)
+ self.assertEqual(response.text, "This is the result.")
+ self.assertEqual(response.raw, mock_response)
+
+ # Verify call to dashscope.MultiModalConversation.call
+ mock_call.assert_called_once_with(
+ model=self.wrapper.model_name,
+ messages=messages,
+ )
+
+ @patch(
+ "agentscope.models.dashscope_model."
+ "dashscope.MultiModalConversation.call",
+ )
+ def test_call_failure(self, mock_call: MagicMock) -> None:
+ """Test call failure"""
+ # Simulating a failed API call
+ mock_response = MagicMock()
+ mock_response.status_code = 400
+ mock_response.request_id = "Test_request_id"
+ mock_response.code = "Error Code"
+ mock_response.message = "Error Message"
+ mock_call.return_value = mock_response
+
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {"text": "What does this picture depict?"},
+ {"image": "http://example.com/image.jpg"},
+ ],
+ },
+ ]
+ # Expecting a RuntimeError to be raised
+ with self.assertRaises(RuntimeError) as context:
+ self.wrapper(messages=messages)
+
+ # Assert the expected exception message
+ self.assertIn("Error Code", str(context.exception))
+ self.assertIn("Error Message", str(context.exception))
+ self.assertIn("Test_request_id", str(context.exception))
+ self.assertIn("400", str(context.exception))
+
+ # Verify call to dashscope.MultiModalConversation.call
+ mock_call.assert_called_once_with(
+ model=self.wrapper.model_name,
+ messages=messages,
+ )
+
+
if __name__ == "__main__":
unittest.main()
diff --git a/tests/format_test.py b/tests/format_test.py
index 452c1cdd2..589398837 100644
--- a/tests/format_test.py
+++ b/tests/format_test.py
@@ -10,6 +10,7 @@
OllamaGenerationWrapper,
GeminiChatWrapper,
DashScopeChatWrapper,
+ DashScopeMultiModalWrapper,
)
@@ -172,6 +173,132 @@ def test_dashscope_chat(self) -> None:
with self.assertRaises(TypeError):
model.format(*self.wrong_inputs) # type: ignore[arg-type]
+ def test_dashscope_multimodal_image(self) -> None:
+ """Unit test for the format function in dashscope multimodal
+ conversation api wrapper for image."""
+ model = DashScopeMultiModalWrapper(
+ config_name="",
+ model_name="qwen-vl-plus",
+ api_key="xxx",
+ )
+
+ multimodal_input = [
+ Msg(
+ "system",
+ "You are a helpful assistant",
+ role="system",
+ url="url1.png",
+ ),
+ [
+ Msg(
+ "user",
+ "What is the weather today?",
+ role="user",
+ url="url2.png",
+ ),
+ Msg(
+ "assistant",
+ "It is sunny today",
+ role="assistant",
+ url="url3.png",
+ ),
+ ],
+ ]
+
+ ground_truth = [
+ {
+ "role": "system",
+ "content": [
+ {"image": "url1.png"},
+ {"text": "You are a helpful assistant"},
+ ],
+ },
+ {
+ "role": "user",
+ "content": [
+ {"image": "url2.png"},
+ {"image": "url3.png"},
+ {
+ "text": (
+ "## Dialogue History\n"
+ "user: What is the weather today?\n"
+ "assistant: It is sunny today"
+ ),
+ },
+ ],
+ },
+ ]
+
+ prompt = model.format(*multimodal_input)
+ self.assertListEqual(prompt, ground_truth)
+
+ # wrong format
+ with self.assertRaises(TypeError):
+ model.format(*self.wrong_inputs)
+
+ def test_dashscope_multimodal_audio(self) -> None:
+ """Unit test for the format function in dashscope multimodal
+ conversation api wrapper for audio."""
+ model = DashScopeMultiModalWrapper(
+ config_name="",
+ model_name="qwen-audio-turbo",
+ api_key="xxx",
+ )
+
+ multimodal_input = [
+ Msg(
+ "system",
+ "You are a helpful assistant",
+ role="system",
+ url="url1.mp3",
+ ),
+ [
+ Msg(
+ "user",
+ "What is the weather today?",
+ role="user",
+ url="url2.mp3",
+ ),
+ Msg(
+ "assistant",
+ "It is sunny today",
+ role="assistant",
+ url="url3.mp3",
+ ),
+ ],
+ ]
+
+ ground_truth = [
+ {
+ "role": "system",
+ "content": [
+ {"audio": "url1.mp3"},
+ {"text": "You are a helpful assistant"},
+ ],
+ },
+ {
+ "role": "user",
+ "content": [
+ {"audio": "url2.mp3"},
+ {"audio": "url3.mp3"},
+ {
+ "text": (
+ "## Dialogue History\n"
+ "user: What is the weather today?\n"
+ "assistant: It is sunny today"
+ ),
+ },
+ ],
+ },
+ ]
+
+ prompt = model.format(*multimodal_input)
+ self.assertListEqual(prompt, ground_truth)
+
+ # wrong format
+ with self.assertRaises(TypeError):
+ model.format(*self.wrong_inputs)
+
if __name__ == "__main__":
unittest.main()