diff --git a/dotnet/sample/AutoGen.BasicSamples/AutoGen.BasicSample.csproj b/dotnet/sample/AutoGen.BasicSamples/AutoGen.BasicSample.csproj index c4e41261933..3c2b5166988 100644 --- a/dotnet/sample/AutoGen.BasicSamples/AutoGen.BasicSample.csproj +++ b/dotnet/sample/AutoGen.BasicSamples/AutoGen.BasicSample.csproj @@ -16,4 +16,10 @@ + + + + PreserveNewest + + diff --git a/dotnet/sample/AutoGen.BasicSamples/Example15_GPT4V_BinaryDataImageMessage.cs b/dotnet/sample/AutoGen.BasicSamples/Example15_GPT4V_BinaryDataImageMessage.cs new file mode 100644 index 00000000000..7a3422cb863 --- /dev/null +++ b/dotnet/sample/AutoGen.BasicSamples/Example15_GPT4V_BinaryDataImageMessage.cs @@ -0,0 +1,62 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Example15_ImageMessage.cs + +using AutoGen.Core; +using AutoGen.OpenAI; + +namespace AutoGen.BasicSample; + +/// +/// This example shows usage of ImageMessage. The image is loaded as BinaryData and sent to GPT-4V +///
+///
+/// Add additional images to the ImageResources to load and send more images to GPT-4V +///
+public static class Example15_GPT4V_BinaryDataImageMessage
+{
+    /// <summary>Directory under the application base directory that is scanned for images to send.</summary>
+    private static readonly string ImageResourcePath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "ImageResources");
+
+    /// <summary>
+    /// Maps a supported image file extension to the media type attached to its <see cref="BinaryData"/>.
+    /// Lookups ignore case, so ".PNG" and ".png" are treated alike.
+    /// </summary>
+    private static readonly Dictionary<string, string> _mediaTypeMappings = new(StringComparer.OrdinalIgnoreCase)
+    {
+        { ".png", "image/png" },
+        { ".jpeg", "image/jpeg" },
+        { ".jpg", "image/jpeg" },
+        { ".gif", "image/gif" },
+        { ".webp", "image/webp" }
+    };
+
+    /// <summary>
+    /// Sends one multi-modal user message (a text question plus every supported image found in
+    /// <see cref="ImageResourcePath"/>) to a GPT-4V agent.
+    /// </summary>
+    /// <exception cref="InvalidOperationException">The OPENAI_API_KEY environment variable is not set.</exception>
+    public static async Task RunAsync()
+    {
+        var openAIKey = Environment.GetEnvironmentVariable("OPENAI_API_KEY")
+            ?? throw new InvalidOperationException("Please set OPENAI_API_KEY environment variable.");
+        var openAiConfig = new OpenAIConfig(openAIKey, "gpt-4-vision-preview");
+
+        var visionAgent = new GPTAgent(
+            name: "gpt",
+            systemMessage: "You are a helpful AI assistant",
+            config: openAiConfig,
+            temperature: 0);
+
+        List<IMessage> messages =
+            [new TextMessage(Role.User, "What is this image?", from: "user")];
+        AddMessagesFromResource(ImageResourcePath, messages);
+
+        var multiModalMessage = new MultiModalMessage(Role.User, messages, from: "user");
+
+        // The sample only demonstrates sending; the reply is not inspected here.
+        _ = await visionAgent.SendAsync(multiModalMessage);
+    }
+
+    /// <summary>
+    /// Appends one <see cref="ImageMessage"/> to <paramref name="messages"/> for every file in
+    /// <paramref name="imageResourcePath"/> whose extension appears in <see cref="_mediaTypeMappings"/>.
+    /// Files with any other extension are skipped.
+    /// </summary>
+    /// <param name="imageResourcePath">Directory to enumerate (top level only).</param>
+    /// <param name="messages">List that receives the image messages.</param>
+    private static void AddMessagesFromResource(string imageResourcePath, List<IMessage> messages)
+    {
+        foreach (string file in Directory.GetFiles(imageResourcePath))
+        {
+            if (!_mediaTypeMappings.TryGetValue(Path.GetExtension(file), out var mediaType))
+            {
+                continue;
+            }
+
+            // BinaryData.FromStream buffers the stream's content itself, so the file stream can be
+            // handed over directly and disposed afterwards; no intermediate MemoryStream is needed.
+            using var fs = File.OpenRead(file);
+            var imageData = BinaryData.FromStream(fs, mediaType);
+
+            // The images are part of the user's question, so they carry the same role as the text part.
+            messages.Add(new ImageMessage(Role.User, imageData, from: "user"));
+        }
+    }
+}
diff --git a/dotnet/sample/AutoGen.BasicSamples/ImageResources/square.png b/dotnet/sample/AutoGen.BasicSamples/ImageResources/square.png
new file mode 100644
index 00000000000..afb4f4cd4df
--- /dev/null
+++ b/dotnet/sample/AutoGen.BasicSamples/ImageResources/square.png
@@ -0,0 +1,3 @@
+version 
https://git-lfs.github.com/spec/v1
+oid sha256:8323d0b8eceb752e14c29543b2e28bb2fc648ed9719095c31b7708867a4dc918
+size 491
diff --git a/dotnet/src/AutoGen.Core/AutoGen.Core.csproj b/dotnet/src/AutoGen.Core/AutoGen.Core.csproj
index 409b6bc1aaf..ebbec3f0a46 100644
--- a/dotnet/src/AutoGen.Core/AutoGen.Core.csproj
+++ b/dotnet/src/AutoGen.Core/AutoGen.Core.csproj
@@ -16,6 +16,7 @@
+
diff --git a/dotnet/src/AutoGen.Core/Message/ImageMessage.cs b/dotnet/src/AutoGen.Core/Message/ImageMessage.cs
index 18ceea0d111..1239785c411 100644
--- a/dotnet/src/AutoGen.Core/Message/ImageMessage.cs
+++ b/dotnet/src/AutoGen.Core/Message/ImageMessage.cs
@@ -21,14 +21,68 @@ public ImageMessage(Role role, Uri uri, string? from = null)
         this.Url = uri.ToString();
     }

+    /// <summary>
+    /// Creates an image message whose payload is carried inline as <see cref="BinaryData"/>.
+    /// </summary>
+    /// <param name="role">Role recorded on the message.</param>
+    /// <param name="data">Image bytes; must be non-empty and have <see cref="BinaryData.MediaType"/> set.</param>
+    /// <param name="from">Optional sender name.</param>
+    /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
+    /// <exception cref="ArgumentException"><paramref name="data"/> is empty or has no media type.</exception>
+    public ImageMessage(Role role, BinaryData data, string? from = null)
+    {
+        if (data is null)
+        {
+            throw new ArgumentNullException(nameof(data));
+        }
+
+        if (data.IsEmpty)
+        {
+            throw new ArgumentException("Data cannot be empty", nameof(data));
+        }
+
+        if (string.IsNullOrWhiteSpace(data.MediaType))
+        {
+            throw new ArgumentException("MediaType is needed for DataUri Images", nameof(data));
+        }
+
+        this.Role = role;
+        this.From = from;
+        this.Data = data;
+    }
+
     public Role Role { get; set; }

-    public string Url { get; set; }
+    /// <summary>Image location as a string; not assigned by the <see cref="BinaryData"/> constructor.</summary>
+    public string? Url { get; set; }

     public string? From { get; set; }

+    /// <summary>Inline image bytes; assigned by the <see cref="BinaryData"/> constructor.</summary>
+    public BinaryData? Data { get; set; }
+
+    /// <summary>
+    /// Encodes <see cref="Data"/> as a <c>data:{mediaType};base64,{payload}</c> URI string.
+    /// </summary>
+    /// <remarks>
+    /// NOTE(review): the string grows with the image size, and callers in this change wrap it in
+    /// <see cref="Uri"/>, whose constructor documents a maximum input length — confirm large images work.
+    /// </remarks>
+    /// <exception cref="InvalidOperationException"><see cref="Data"/> is null.</exception>
+    public string BuildDataUri()
+    {
+        if (this.Data is null)
+        {
+            throw new InvalidOperationException($"{nameof(Data)} must be set to build a data URI.");
+        }
+
+        return $"data:{this.Data.MediaType};base64,{Convert.ToBase64String(this.Data.ToArray())}";
+    }
+
     public override string ToString()
     {
-        return $"ImageMessage({this.Role}, {this.Url}, {this.From})";
+        // Prefer the inline payload when present, otherwise the URL; a missing URL prints as empty text.
+        var content = this.Data != null ? BuildDataUri() : this.Url ?? string.Empty;
+        return $"ImageMessage({this.Role}, {content}, {this.From})";
     }
 }
diff --git a/dotnet/src/AutoGen.Mistral/DTOs/ChatCompletionResponse.cs b/dotnet/src/AutoGen.Mistral/DTOs/ChatCompletionResponse.cs
index ff241f8d340..13e29e7139b 100644
--- a/dotnet/src/AutoGen.Mistral/DTOs/ChatCompletionResponse.cs
+++ b/dotnet/src/AutoGen.Mistral/DTOs/ChatCompletionResponse.cs
@@ -1,4 +1,4 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
+// Copyright (c) Microsoft Corporation. All rights reserved.
 // ChatCompletionResponse.cs

 using System.Collections.Generic;
diff --git a/dotnet/src/AutoGen.Mistral/DTOs/Error.cs b/dotnet/src/AutoGen.Mistral/DTOs/Error.cs
index 77eb2d341fb..8bddcfc776c 100644
--- a/dotnet/src/AutoGen.Mistral/DTOs/Error.cs
+++ b/dotnet/src/AutoGen.Mistral/DTOs/Error.cs
@@ -1,4 +1,4 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
+// Copyright (c) Microsoft Corporation. All rights reserved.
 // Error.cs

 using System.Text.Json.Serialization;
diff --git a/dotnet/src/AutoGen.Mistral/DTOs/Model.cs b/dotnet/src/AutoGen.Mistral/DTOs/Model.cs
index 915d2f737ec..70a4b3c997d 100644
--- a/dotnet/src/AutoGen.Mistral/DTOs/Model.cs
+++ b/dotnet/src/AutoGen.Mistral/DTOs/Model.cs
@@ -1,4 +1,4 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
+// Copyright (c) Microsoft Corporation. All rights reserved.
 // Model.cs

 using System;
diff --git a/dotnet/src/AutoGen.OpenAI/Extension/MessageExtension.cs b/dotnet/src/AutoGen.OpenAI/Extension/MessageExtension.cs
index 92e0f3776f5..b3dfb1e8668 100644
--- a/dotnet/src/AutoGen.OpenAI/Extension/MessageExtension.cs
+++ b/dotnet/src/AutoGen.OpenAI/Extension/MessageExtension.cs
@@ -77,7 +77,7 @@ public static IEnumerable ToOpenAIChatRequestMessage(this IA
         else if (message is ImageMessage imageMessage)
         {
             // multi-modal
-            var msg = new ChatRequestUserMessage(new ChatMessageImageContentItem(new Uri(imageMessage.Url)));
+            var msg = new ChatRequestUserMessage(new ChatMessageImageContentItem(new Uri(imageMessage.Url ?? 
imageMessage.BuildDataUri())));
             return [msg];
         }

@@ -101,7 +101,7 @@ public static IEnumerable ToOpenAIChatRequestMessage(this IA
                 return m switch
                 {
                     TextMessage textMessage => new ChatMessageTextContentItem(textMessage.Content),
-                    ImageMessage imageMessage => new ChatMessageImageContentItem(new Uri(imageMessage.Url)),
+                    ImageMessage imageMessage => new ChatMessageImageContentItem(new Uri(imageMessage.Url ?? imageMessage.BuildDataUri())),
                     _ => throw new ArgumentException($"Unknown message type: {m.GetType()}")
                 };
             });
diff --git a/dotnet/src/AutoGen.OpenAI/Middleware/OpenAIChatRequestMessageConnector.cs b/dotnet/src/AutoGen.OpenAI/Middleware/OpenAIChatRequestMessageConnector.cs
index c1581cbec08..1276e93f9fb 100644
--- a/dotnet/src/AutoGen.OpenAI/Middleware/OpenAIChatRequestMessageConnector.cs
+++ b/dotnet/src/AutoGen.OpenAI/Middleware/OpenAIChatRequestMessageConnector.cs
@@ -336,7 +336,7 @@ private IEnumerable ProcessIncomingMessagesForOther(TextMess
     private IEnumerable ProcessIncomingMessagesForOther(ImageMessage message)
     {
         return new[] { new ChatRequestUserMessage([
-            new ChatMessageImageContentItem(new Uri(message.Url)),
+            new ChatMessageImageContentItem(new Uri(message.Url ?? message.BuildDataUri())),
             ])};
     }

@@ -345,7 +345,7 @@ private IEnumerable ProcessIncomingMessagesForOther(MultiMod
         IEnumerable items = message.Content.Select(ci => ci switch
         {
             TextMessage text => new ChatMessageTextContentItem(text.Content),
-            ImageMessage image => new ChatMessageImageContentItem(new Uri(image.Url)),
+            ImageMessage image => new ChatMessageImageContentItem(new Uri(image.Url ?? image.BuildDataUri())),
             _ => throw new NotImplementedException(),
         });

diff --git a/dotnet/src/AutoGen.SemanticKernel/Middleware/SemanticKernelChatMessageContentConnector.cs b/dotnet/src/AutoGen.SemanticKernel/Middleware/SemanticKernelChatMessageContentConnector.cs
index e4b7527cd05..557683c9615 100644
--- a/dotnet/src/AutoGen.SemanticKernel/Middleware/SemanticKernelChatMessageContentConnector.cs
+++ b/dotnet/src/AutoGen.SemanticKernel/Middleware/SemanticKernelChatMessageContentConnector.cs
@@ -92,7 +92,8 @@ private IMessage PostProcessMessage(IMessage messageEnvelope
             {
                 TextContent txt => new TextMessage(Role.Assistant, txt.Text!, messageEnvelope.From),
                 ImageContent img when img.Uri is Uri uri => new ImageMessage(Role.Assistant, uri.ToString(), from: messageEnvelope.From),
-                ImageContent img when img.Uri is null => throw new InvalidOperationException("ImageContent.Uri is null"),
+                // ImageMessage rejects BinaryData without a media type, so carry the content's MIME type over.
+                ImageContent img when img.Data is ReadOnlyMemory<byte> data => new ImageMessage(Role.Assistant, BinaryData.FromBytes(data, img.MimeType), from: messageEnvelope.From),
                 _ => throw new InvalidOperationException("Unsupported content type"),
             });

@@ -185,9 +186,8 @@ private IEnumerable ProcessMessageForOthers(TextMessage mess

     private IEnumerable ProcessMessageForOthers(ImageMessage message)
     {
-        var imageContent = new ImageContent(new Uri(message.Url));
         var collectionItems = new ChatMessageContentItemCollection();
-        collectionItems.Add(imageContent);
+        collectionItems.Add(new ImageContent(new Uri(message.Url ?? message.BuildDataUri())));
         return [new ChatMessageContent(AuthorRole.User, collectionItems)];
     }

@@ -207,7 +207,7 @@ private IEnumerable ProcessMessageForOthers(MultiModalMessag
             }
             else if (item is ImageMessage imageContent)
             {
-                collections.Add(new ImageContent(new Uri(imageContent.Url)));
+                collections.Add(new ImageContent(new Uri(imageContent.Url ?? 
imageContent.BuildDataUri())));
             }
             else
             {
diff --git a/dotnet/test/AutoGen.Tests/ApprovalTests/square.png b/dotnet/test/AutoGen.Tests/ApprovalTests/square.png
new file mode 100644
index 00000000000..afb4f4cd4df
--- /dev/null
+++ b/dotnet/test/AutoGen.Tests/ApprovalTests/square.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8323d0b8eceb752e14c29543b2e28bb2fc648ed9719095c31b7708867a4dc918
+size 491
diff --git a/dotnet/test/AutoGen.Tests/AutoGen.Tests.csproj b/dotnet/test/AutoGen.Tests/AutoGen.Tests.csproj
index f7e6b036506..9a7b07b34dd 100644
--- a/dotnet/test/AutoGen.Tests/AutoGen.Tests.csproj
+++ b/dotnet/test/AutoGen.Tests/AutoGen.Tests.csproj
@@ -21,4 +21,10 @@
+
+
+      PreserveNewest
+
+
+
diff --git a/dotnet/test/AutoGen.Tests/BasicSampleTest.cs b/dotnet/test/AutoGen.Tests/BasicSampleTest.cs
index 19de2bdef4b..b9eea67397c 100644
--- a/dotnet/test/AutoGen.Tests/BasicSampleTest.cs
+++ b/dotnet/test/AutoGen.Tests/BasicSampleTest.cs
@@ -68,6 +68,12 @@ public async Task DalleAndGPT4VTestAsync()
             await Example05_Dalle_And_GPT4V.RunAsync();
         }

+        [ApiKeyFact("OPENAI_API_KEY")]
+        public async Task GPT4ImageMessage()
+        {
+            await Example15_GPT4V_BinaryDataImageMessage.RunAsync();
+        }
+
         public class ConsoleWriter : StringWriter
         {
             private ITestOutputHelper output;
diff --git a/dotnet/test/AutoGen.Tests/SingleAgentTest.cs b/dotnet/test/AutoGen.Tests/SingleAgentTest.cs
index d314b391bae..6dfb61761eb 100644
--- a/dotnet/test/AutoGen.Tests/SingleAgentTest.cs
+++ b/dotnet/test/AutoGen.Tests/SingleAgentTest.cs
@@ -3,6 +3,7 @@

 using System;
 using System.Collections.Generic;
+using System.IO;
 using System.Linq;
 using System.Threading.Tasks;
 using AutoGen.OpenAI;
@@ -80,11 +81,23 @@ public async Task GPTAgentVisionTestAsync()

             var imageMessage = new ImageMessage(Role.User, imageUri, from: "user");

+            // Same kind of request, but with the image carried inline as BinaryData instead of a URL.
+            string imagePath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "ApprovalTests", "square.png");
+            ImageMessage imageMessageData;
+            using (var fs = File.OpenRead(imagePath))
+            {
+                // FromStreamAsync buffers the stream's content, so no intermediate MemoryStream is needed.
+                var imageData = await BinaryData.FromStreamAsync(fs, "image/png");
+                imageMessageData = new ImageMessage(Role.User, imageData, from: "user");
+            }
+
             IMessage[] messages = [
                 MessageEnvelope.Create(oaiMessage),
                 multiModalMessage,
                 imageMessage,
+                imageMessageData
             ];
+
             foreach (var message in messages)
             {
                 var response = await visionAgent.SendAsync(message);