From f3633e2985e8a6bf0a9f2571c552828fec7e91fa Mon Sep 17 00:00:00 2001
From: westey <164392973+westey-m@users.noreply.github.com>
Date: Mon, 4 Nov 2024 13:44:12 +0000
Subject: [PATCH] .Net: Add sample to show how to create a decorator for
 generating embeddings. (#9502)

### Motivation and Context

Generating embeddings directly every time can be frustrating and it's nice to hide this functionality away from the main code #9483

### Description

Adding a sample that shows how to create a set of decorator classes for Vector Stores to automatically generate embeddings on upsert and when using VectorizableTextSearch.

### Contribution Checklist

- [x] The code builds clean without any errors or warnings
- [x] The PR follows the [SK Contribution Guidelines](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md) and the [pre-submission formatting script](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md#development-scripts) raises no violations
- [x] All unit tests pass, and I have added new tests where possible
- [x] I didn't break anyone :smile:
---
 .../GenerateTextEmbeddingAttribute.cs         |  39 ++++
 .../TextEmbeddingVectorStore.cs               |  48 +++++
 .../TextEmbeddingVectorStoreExtensions.cs     |  40 ++++
 ...extEmbeddingVectorStoreRecordCollection.cs | 193 ++++++++++++++++++
 .../Memory/VectorStore_EmbeddingGeneration.cs | 120 +++++++++++
 5 files changed, 440 insertions(+)
 create mode 100644 dotnet/samples/Concepts/Memory/VectorStoreEmbeddingGeneration/GenerateTextEmbeddingAttribute.cs
 create mode 100644 dotnet/samples/Concepts/Memory/VectorStoreEmbeddingGeneration/TextEmbeddingVectorStore.cs
 create mode 100644 dotnet/samples/Concepts/Memory/VectorStoreEmbeddingGeneration/TextEmbeddingVectorStoreExtensions.cs
 create mode 100644 dotnet/samples/Concepts/Memory/VectorStoreEmbeddingGeneration/TextEmbeddingVectorStoreRecordCollection.cs
 create mode 100644 dotnet/samples/Concepts/Memory/VectorStore_EmbeddingGeneration.cs

diff --git a/dotnet/samples/Concepts/Memory/VectorStoreEmbeddingGeneration/GenerateTextEmbeddingAttribute.cs b/dotnet/samples/Concepts/Memory/VectorStoreEmbeddingGeneration/GenerateTextEmbeddingAttribute.cs
new file mode 100644
index 000000000000..9a8e6b17aa27
--- /dev/null
+++ b/dotnet/samples/Concepts/Memory/VectorStoreEmbeddingGeneration/GenerateTextEmbeddingAttribute.cs
@@ -0,0 +1,39 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+namespace Memory.VectorStoreEmbeddingGeneration;
+
+/// <summary>
+/// An attribute that can be used for an embedding property to indicate that it should
+/// be generated from one or more text properties located on the same class.
+/// </summary>
+/// <remarks>
+/// This class is part of the <see cref="VectorStore_EmbeddingGeneration"/> sample.
+/// </remarks>
+[AttributeUsage(AttributeTargets.Property, AllowMultiple = false, Inherited = true)]
+public sealed class GenerateTextEmbeddingAttribute : Attribute
+{
+    /// <summary>
+    /// Initializes a new instance of the <see cref="GenerateTextEmbeddingAttribute"/> class.
+    /// </summary>
+    /// <param name="sourcePropertyName">The name of the property that the embedding should be generated from.</param>
+#pragma warning disable CA1019 // Define accessors for attribute arguments
+    public GenerateTextEmbeddingAttribute(string sourcePropertyName)
+#pragma warning restore CA1019 // Define accessors for attribute arguments
+    {
+        this.SourcePropertyNames = [sourcePropertyName];
+    }
+
+    /// <summary>
+    /// Initializes a new instance of the <see cref="GenerateTextEmbeddingAttribute"/> class.
+    /// </summary>
+    /// <param name="sourcePropertyNames">The names of the properties that the embedding should be generated from.</param>
+    public GenerateTextEmbeddingAttribute(string[] sourcePropertyNames)
+    {
+        this.SourcePropertyNames = sourcePropertyNames;
+    }
+
+    /// <summary>
+    /// Gets the names of the properties to use as the source for generating the embedding.
+    /// </summary>
+    public string[] SourcePropertyNames { get; }
+}
diff --git a/dotnet/samples/Concepts/Memory/VectorStoreEmbeddingGeneration/TextEmbeddingVectorStore.cs b/dotnet/samples/Concepts/Memory/VectorStoreEmbeddingGeneration/TextEmbeddingVectorStore.cs
new file mode 100644
index 000000000000..6848b38af48f
--- /dev/null
+++ b/dotnet/samples/Concepts/Memory/VectorStoreEmbeddingGeneration/TextEmbeddingVectorStore.cs
@@ -0,0 +1,48 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using Microsoft.Extensions.VectorData;
+using Microsoft.SemanticKernel.Embeddings;
+
+namespace Memory.VectorStoreEmbeddingGeneration;
+
+/// <summary>
+/// Decorator for a <see cref="IVectorStore"/> that generates embeddings for records on upsert.
+/// </summary>
+/// <remarks>
+/// This class is part of the <see cref="VectorStore_EmbeddingGeneration"/> sample.
+/// </remarks>
+public class TextEmbeddingVectorStore : IVectorStore
+{
+    /// <summary>The decorated <see cref="IVectorStore"/>.</summary>
+    private readonly IVectorStore _decoratedVectorStore;
+
+    /// <summary>The service to use for generating the embeddings.</summary>
+    private readonly ITextEmbeddingGenerationService _textEmbeddingGenerationService;
+
+    /// <summary>
+    /// Initializes a new instance of the <see cref="TextEmbeddingVectorStore"/> class.
+    /// </summary>
+    /// <param name="decoratedVectorStore">The decorated <see cref="IVectorStore"/>.</param>
+    /// <param name="textEmbeddingGenerationService">The service to use for generating the embeddings.</param>
+    public TextEmbeddingVectorStore(IVectorStore decoratedVectorStore, ITextEmbeddingGenerationService textEmbeddingGenerationService)
+    {
+        // Verify & Assign.
+        this._decoratedVectorStore = decoratedVectorStore ?? throw new ArgumentNullException(nameof(decoratedVectorStore));
+        this._textEmbeddingGenerationService = textEmbeddingGenerationService ?? throw new ArgumentNullException(nameof(textEmbeddingGenerationService));
+    }
+
+    /// <inheritdoc />
+    public IVectorStoreRecordCollection<TKey, TRecord> GetCollection<TKey, TRecord>(string name, VectorStoreRecordDefinition? vectorStoreRecordDefinition = null)
+        where TKey : notnull
+    {
+        var collection = this._decoratedVectorStore.GetCollection<TKey, TRecord>(name, vectorStoreRecordDefinition);
+        var embeddingStore = new TextEmbeddingVectorStoreRecordCollection<TKey, TRecord>(collection, this._textEmbeddingGenerationService);
+        return embeddingStore;
+    }
+
+    /// <inheritdoc />
+    public IAsyncEnumerable<string> ListCollectionNamesAsync(CancellationToken cancellationToken = default)
+    {
+        return this._decoratedVectorStore.ListCollectionNamesAsync(cancellationToken);
+    }
+}
diff --git a/dotnet/samples/Concepts/Memory/VectorStoreEmbeddingGeneration/TextEmbeddingVectorStoreExtensions.cs b/dotnet/samples/Concepts/Memory/VectorStoreEmbeddingGeneration/TextEmbeddingVectorStoreExtensions.cs
new file mode 100644
index 000000000000..e1b6c779fdb8
--- /dev/null
+++ b/dotnet/samples/Concepts/Memory/VectorStoreEmbeddingGeneration/TextEmbeddingVectorStoreExtensions.cs
@@ -0,0 +1,40 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using Microsoft.Extensions.VectorData;
+using Microsoft.SemanticKernel.Embeddings;
+
+namespace Memory.VectorStoreEmbeddingGeneration;
+
+/// <summary>
+/// Contains extension methods to help add text embedding generation to a <see cref="IVectorStore"/> or <see cref="IVectorStoreRecordCollection{TKey, TRecord}"/>.
+/// </summary>
+/// <remarks>
+/// This class is part of the <see cref="VectorStore_EmbeddingGeneration"/> sample.
+/// </remarks>
+public static class TextEmbeddingVectorStoreExtensions
+{
+    /// <summary>
+    /// Add text embedding generation to a <see cref="IVectorStore"/>.
+    /// </summary>
+    /// <param name="vectorStore">The <see cref="IVectorStore"/> to add text embedding generation to.</param>
+    /// <param name="textEmbeddingGenerationService">The service to use for generating text embeddings.</param>
+    /// <returns>The <see cref="IVectorStore"/> with text embedding generation added.</returns>
+    public static IVectorStore UseTextEmbeddingGeneration(this IVectorStore vectorStore, ITextEmbeddingGenerationService textEmbeddingGenerationService)
+    {
+        return new TextEmbeddingVectorStore(vectorStore, textEmbeddingGenerationService);
+    }
+
+    /// <summary>
+    /// Add text embedding generation to a <see cref="IVectorStoreRecordCollection{TKey, TRecord}"/>.
+    /// </summary>
+    /// <param name="vectorStoreRecordCollection">The <see cref="IVectorStoreRecordCollection{TKey, TRecord}"/> to add text embedding generation to.</param>
+    /// <param name="textEmbeddingGenerationService">The service to use for generating text embeddings.</param>
+    /// <typeparam name="TKey">The data type of the record key.</typeparam>
+    /// <typeparam name="TRecord">The record data model to use for adding, updating and retrieving data from the store.</typeparam>
+    /// <returns>The <see cref="IVectorStoreRecordCollection{TKey, TRecord}"/> with text embedding generation added.</returns>
+    public static IVectorStoreRecordCollection<TKey, TRecord> UseTextEmbeddingGeneration<TKey, TRecord>(this IVectorStoreRecordCollection<TKey, TRecord> vectorStoreRecordCollection, ITextEmbeddingGenerationService textEmbeddingGenerationService)
+        where TKey : notnull
+    {
+        return new TextEmbeddingVectorStoreRecordCollection<TKey, TRecord>(vectorStoreRecordCollection, textEmbeddingGenerationService);
+    }
+}
diff --git a/dotnet/samples/Concepts/Memory/VectorStoreEmbeddingGeneration/TextEmbeddingVectorStoreRecordCollection.cs b/dotnet/samples/Concepts/Memory/VectorStoreEmbeddingGeneration/TextEmbeddingVectorStoreRecordCollection.cs
new file mode 100644
index 000000000000..a3c5517653af
--- /dev/null
+++ b/dotnet/samples/Concepts/Memory/VectorStoreEmbeddingGeneration/TextEmbeddingVectorStoreRecordCollection.cs
@@ -0,0 +1,193 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using System.Reflection;
+using System.Runtime.CompilerServices;
+using Microsoft.Extensions.VectorData;
+using Microsoft.SemanticKernel.Embeddings;
+
+namespace Memory.VectorStoreEmbeddingGeneration;
+
+/// <summary>
+/// Decorator for a <see cref="IVectorStoreRecordCollection{TKey, TRecord}"/> that generates embeddings for records on upsert and when using <see cref="VectorizableTextSearchAsync"/>.
+/// </summary>
+/// <remarks>
+/// This class is part of the <see cref="VectorStore_EmbeddingGeneration"/> sample.
+/// </remarks>
+/// <typeparam name="TKey">The data type of the record key.</typeparam>
+/// <typeparam name="TRecord">The record data model to use for adding, updating and retrieving data from the store.</typeparam>
+#pragma warning disable CA1711 // Identifiers should not have incorrect suffix
+public class TextEmbeddingVectorStoreRecordCollection<TKey, TRecord> : IVectorStoreRecordCollection<TKey, TRecord>, IVectorizableTextSearch<TRecord>
+#pragma warning restore CA1711 // Identifiers should not have incorrect suffix
+    where TKey : notnull
+{
+    /// <summary>The decorated <see cref="IVectorStoreRecordCollection{TKey, TRecord}"/>.</summary>
+    private readonly IVectorStoreRecordCollection<TKey, TRecord> _decoratedVectorStoreRecordCollection;
+
+    /// <summary>The service to use for generating the embeddings.</summary>
+    private readonly ITextEmbeddingGenerationService _textEmbeddingGenerationService;
+
+    /// <summary>The embedding properties of <typeparamref name="TRecord"/>, each paired with the source properties that its embedding is generated from.</summary>
+    private readonly IReadOnlyList<(PropertyInfo EmbeddingPropertyInfo, IList<PropertyInfo> SourcePropertiesInfo)> _embeddingPropertiesWithSourceProperties;
+
+    /// <summary>
+    /// Initializes a new instance of the <see cref="TextEmbeddingVectorStoreRecordCollection{TKey, TRecord}"/> class.
+    /// </summary>
+    /// <param name="decoratedVectorStoreRecordCollection">The decorated <see cref="IVectorStoreRecordCollection{TKey, TRecord}"/>.</param>
+    /// <param name="textEmbeddingGenerationService">The service to use for generating the embeddings.</param>
+    /// <exception cref="ArgumentException">Thrown when embedding properties are referencing data source properties that do not exist or are not of type string.</exception>
+    /// <exception cref="ArgumentNullException">Thrown when required parameters are null.</exception>
+    public TextEmbeddingVectorStoreRecordCollection(IVectorStoreRecordCollection<TKey, TRecord> decoratedVectorStoreRecordCollection, ITextEmbeddingGenerationService textEmbeddingGenerationService)
+    {
+        // Assign.
+        this._decoratedVectorStoreRecordCollection = decoratedVectorStoreRecordCollection ?? throw new ArgumentNullException(nameof(decoratedVectorStoreRecordCollection));
+        this._textEmbeddingGenerationService = textEmbeddingGenerationService ?? throw new ArgumentNullException(nameof(textEmbeddingGenerationService));
+
+        // Find all the embedding properties to generate embeddings for. The finder is a lazy iterator, so materialize it here: validation errors then surface at construction and reflection runs only once.
+        this._embeddingPropertiesWithSourceProperties = FindDataPropertiesWithEmbeddingProperties(typeof(TRecord)).ToList();
+    }
+
+    /// <inheritdoc />
+    public string CollectionName => this._decoratedVectorStoreRecordCollection.CollectionName;
+
+    /// <inheritdoc />
+    public Task<bool> CollectionExistsAsync(CancellationToken cancellationToken = default)
+    {
+        return this._decoratedVectorStoreRecordCollection.CollectionExistsAsync(cancellationToken);
+    }
+
+    /// <inheritdoc />
+    public Task CreateCollectionAsync(CancellationToken cancellationToken = default)
+    {
+        return this._decoratedVectorStoreRecordCollection.CreateCollectionAsync(cancellationToken);
+    }
+
+    /// <inheritdoc />
+    public async Task CreateCollectionIfNotExistsAsync(CancellationToken cancellationToken = default)
+    {
+        if (!await this.CollectionExistsAsync(cancellationToken).ConfigureAwait(false))
+        {
+            await this.CreateCollectionAsync(cancellationToken).ConfigureAwait(false);
+        }
+    }
+
+    /// <inheritdoc />
+    public Task DeleteCollectionAsync(CancellationToken cancellationToken = default)
+    {
+        return this._decoratedVectorStoreRecordCollection.DeleteCollectionAsync(cancellationToken);
+    }
+
+    /// <inheritdoc />
+    public Task DeleteAsync(TKey key, DeleteRecordOptions? options = null, CancellationToken cancellationToken = default)
+    {
+        return this._decoratedVectorStoreRecordCollection.DeleteAsync(key, options, cancellationToken);
+    }
+
+    /// <inheritdoc />
+    public Task DeleteBatchAsync(IEnumerable<TKey> keys, DeleteRecordOptions? options = null, CancellationToken cancellationToken = default)
+    {
+        return this._decoratedVectorStoreRecordCollection.DeleteBatchAsync(keys, options, cancellationToken);
+    }
+
+    /// <inheritdoc />
+    public Task<TRecord?> GetAsync(TKey key, GetRecordOptions? options = null, CancellationToken cancellationToken = default)
+    {
+        return this._decoratedVectorStoreRecordCollection.GetAsync(key, options, cancellationToken);
+    }
+
+    /// <inheritdoc />
+    public IAsyncEnumerable<TRecord> GetBatchAsync(IEnumerable<TKey> keys, GetRecordOptions? options = null, CancellationToken cancellationToken = default)
+    {
+        return this._decoratedVectorStoreRecordCollection.GetBatchAsync(keys, options, cancellationToken);
+    }
+
+    /// <inheritdoc />
+    public async Task<TKey> UpsertAsync(TRecord record, UpsertRecordOptions? options = null, CancellationToken cancellationToken = default)
+    {
+        var recordWithEmbeddings = await this.AddEmbeddingsAsync(record, cancellationToken).ConfigureAwait(false);
+        return await this._decoratedVectorStoreRecordCollection.UpsertAsync(recordWithEmbeddings, options, cancellationToken).ConfigureAwait(false);
+    }
+
+    /// <inheritdoc />
+    public async IAsyncEnumerable<TKey> UpsertBatchAsync(IEnumerable<TRecord> records, UpsertRecordOptions? options = null, [EnumeratorCancellation] CancellationToken cancellationToken = default)
+    {
+        var recordWithEmbeddingsTasks = records.Select(r => this.AddEmbeddingsAsync(r, cancellationToken));
+        var recordWithEmbeddings = await Task.WhenAll(recordWithEmbeddingsTasks).ConfigureAwait(false);
+        var upsertResults = this._decoratedVectorStoreRecordCollection.UpsertBatchAsync(recordWithEmbeddings, options, cancellationToken);
+        await foreach (var upsertResult in upsertResults.ConfigureAwait(false))
+        {
+            yield return upsertResult;
+        }
+    }
+
+    /// <inheritdoc />
+    public Task<VectorSearchResults<TRecord>> VectorizedSearchAsync<TVector>(TVector vector, VectorSearchOptions? options = null, CancellationToken cancellationToken = default)
+    {
+        return this._decoratedVectorStoreRecordCollection.VectorizedSearchAsync(vector, options, cancellationToken);
+    }
+
+    /// <inheritdoc />
+    public async Task<VectorSearchResults<TRecord>> VectorizableTextSearchAsync(string searchText, VectorSearchOptions? options = null, CancellationToken cancellationToken = default)
+    {
+        var embeddingValue = await this._textEmbeddingGenerationService.GenerateEmbeddingAsync(searchText, cancellationToken: cancellationToken).ConfigureAwait(false);
+        return await this.VectorizedSearchAsync(embeddingValue, options, cancellationToken).ConfigureAwait(false);
+    }
+
+    /// <summary>
+    /// Generate and add embeddings for each embedding field that has a <see cref="GenerateTextEmbeddingAttribute"/> on the provided record.
+    /// </summary>
+    /// <param name="record">The record to generate embeddings for.</param>
+    /// <param name="cancellationToken">The <see cref="CancellationToken"/> to monitor for cancellation requests.</param>
+    /// <returns>The record with embeddings added.</returns>
+    private async Task<TRecord> AddEmbeddingsAsync(TRecord record, CancellationToken cancellationToken)
+    {
+        foreach (var (embeddingPropertyInfo, sourcePropertiesInfo) in this._embeddingPropertiesWithSourceProperties)
+        {
+            var sourceValues = sourcePropertiesInfo.Select(x => x.GetValue(record)).Cast<string>().Where(x => !string.IsNullOrWhiteSpace(x));
+            var sourceString = string.Join("\n", sourceValues);
+
+            var embeddingValue = await this._textEmbeddingGenerationService.GenerateEmbeddingAsync(sourceString, cancellationToken: cancellationToken).ConfigureAwait(false);
+            embeddingPropertyInfo.SetValue(record, embeddingValue);
+        }
+
+        return record;
+    }
+
+    /// <summary>
+    /// Get the list of properties with <see cref="GenerateTextEmbeddingAttribute"/> from the data model.
+    /// </summary>
+    /// <param name="dataModelType">The type of the data model to find the embedding properties on.</param>
+    /// <returns>The list of properties with <see cref="GenerateTextEmbeddingAttribute"/> with the properties from which the embedding can be generated. Evaluated lazily: validation exceptions are thrown while enumerating.</returns>
+    private static IEnumerable<(PropertyInfo EmbeddingPropertyInfo, IList<PropertyInfo> SourcePropertiesInfo)> FindDataPropertiesWithEmbeddingProperties(Type dataModelType)
+    {
+        var allProperties = dataModelType.GetProperties();
+        var propertiesDictionary = allProperties.ToDictionary(p => p.Name);
+
+        // Loop through all the properties to find the ones that have the GenerateTextEmbeddingAttribute.
+        foreach (var property in allProperties)
+        {
+            var attribute = property.GetCustomAttribute<GenerateTextEmbeddingAttribute>();
+            if (attribute is not null)
+            {
+                // Find the source properties that the embedding should be generated from.
+                var sourcePropertiesInfo = new List<PropertyInfo>();
+                foreach (var sourcePropertyName in attribute.SourcePropertyNames)
+                {
+                    if (!propertiesDictionary.TryGetValue(sourcePropertyName, out var sourcePropertyInfo))
+                    {
+                        throw new ArgumentException($"The source property '{sourcePropertyName}' as referenced by embedding property '{property.Name}' does not exist in the record model.");
+                    }
+                    else if (sourcePropertyInfo.PropertyType != typeof(string))
+                    {
+                        throw new ArgumentException($"The source property '{sourcePropertyName}' as referenced by embedding property '{property.Name}' has type {sourcePropertyInfo.PropertyType} but must be a string.");
+                    }
+                    else
+                    {
+                        sourcePropertiesInfo.Add(sourcePropertyInfo);
+                    }
+                }
+
+                yield return (property, sourcePropertiesInfo);
+            }
+        }
+    }
+}
diff --git a/dotnet/samples/Concepts/Memory/VectorStore_EmbeddingGeneration.cs b/dotnet/samples/Concepts/Memory/VectorStore_EmbeddingGeneration.cs
new file mode 100644
index 000000000000..b641443e878a
--- /dev/null
+++ b/dotnet/samples/Concepts/Memory/VectorStore_EmbeddingGeneration.cs
@@ -0,0 +1,120 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using Azure.Identity;
+using Memory.VectorStoreEmbeddingGeneration;
+using Microsoft.Extensions.VectorData;
+using Microsoft.SemanticKernel.Connectors.AzureOpenAI;
+using Microsoft.SemanticKernel.Connectors.InMemory;
+
+namespace Memory;
+
+/// <summary>
+/// This sample shows how to abstract embedding generation away from usage by
+/// using the decorator pattern.
+///
+/// In the sample we create an <see cref="InMemoryVectorStore"/> and then using
+/// an extension method <see cref="TextEmbeddingVectorStoreExtensions.UseTextEmbeddingGeneration(IVectorStore, Microsoft.SemanticKernel.Embeddings.ITextEmbeddingGenerationService)"/>
+/// we wrap the <see cref="InMemoryVectorStore"/> with a <see cref="TextEmbeddingVectorStore"/> that will automatically generate embeddings for properties
+/// that have the <see cref="GenerateTextEmbeddingAttribute"/> attribute.
+///
+/// The decorated vector store also adds the additional <see cref="IVectorizableTextSearch{TRecord}"/> interface to the collection
+/// which allows us to search the collection using a text string without having to manually generate the embeddings.
+///
+/// Note that the <see cref="TextEmbeddingVectorStore"/> and related decorator classes demonstrated here are part of this sample and not part of the Semantic Kernel libraries.
+/// To use them, you will need to copy them to your own project.
+/// </summary>
+public class VectorStore_EmbeddingGeneration(ITestOutputHelper output) : BaseTest(output)
+{
+    [Fact]
+    public async Task UseEmbeddingGenerationViaDecoratorAsync()
+    {
+        // Create an embedding generation service.
+        var textEmbeddingGenerationService = new AzureOpenAITextEmbeddingGenerationService(
+                TestConfiguration.AzureOpenAIEmbeddings.DeploymentName,
+                TestConfiguration.AzureOpenAIEmbeddings.Endpoint,
+                new AzureCliCredential());
+
+        // Construct an InMemory vector store with embedding generation.
+        // The UseTextEmbeddingGeneration method adds an embedding generation
+        // decorator class to the vector store that will automatically generate
+        // embeddings for properties that are decorated with the GenerateTextEmbeddingAttribute.
+        var vectorStore = new InMemoryVectorStore().UseTextEmbeddingGeneration(textEmbeddingGenerationService);
+
+        // Get and create collection if it doesn't exist.
+        var collection = vectorStore.GetCollection<ulong, Glossary>("skglossary");
+        await collection.CreateCollectionIfNotExistsAsync();
+
+        // Create and upsert glossary entries into the collection.
+        await collection.UpsertBatchAsync(CreateGlossaryEntries()).ToListAsync();
+
+        // Search the collection using a vectorizable text search. A direct cast is used so that an unexpected collection type fails with a clear InvalidCastException.
+        var search = (IVectorizableTextSearch<Glossary>)collection;
+        var searchString = "What is an Application Programming Interface";
+        var searchResult = await search.VectorizableTextSearchAsync(searchString, new() { Top = 1 });
+        var resultRecords = await searchResult.Results.ToListAsync();
+
+        Console.WriteLine("Search string: " + searchString);
+        Console.WriteLine("Result: " + resultRecords.First().Record.Definition);
+        Console.WriteLine();
+    }
+
+    /// <summary>
+    /// Sample model class that represents a glossary entry.
+    /// </summary>
+    /// <remarks>
+    /// Note that each property is decorated with an attribute that specifies how the property should be treated by the vector store.
+    /// This allows us to create a collection in the vector store and upsert and retrieve instances of this class without any further configuration.
+    ///
+    /// The <see cref="DefinitionEmbedding"/> property is also decorated with the <see cref="GenerateTextEmbeddingAttribute"/> attribute which
+    /// allows the vector store to automatically generate an embedding for the property when the record is upserted.
+    /// </remarks>
+    private sealed class Glossary
+    {
+        [VectorStoreRecordKey]
+        public ulong Key { get; set; }
+
+        [VectorStoreRecordData(IsFilterable = true)]
+        public string Category { get; set; }
+
+        [VectorStoreRecordData]
+        public string Term { get; set; }
+
+        [VectorStoreRecordData]
+        public string Definition { get; set; }
+
+        [GenerateTextEmbedding(nameof(Definition))]
+        [VectorStoreRecordVector(1536)]
+        public ReadOnlyMemory<float> DefinitionEmbedding { get; set; }
+    }
+
+    /// <summary>
+    /// Create some sample glossary entries.
+    /// </summary>
+    /// <returns>A list of sample glossary entries.</returns>
+    private static IEnumerable<Glossary> CreateGlossaryEntries()
+    {
+        yield return new Glossary
+        {
+            Key = 1,
+            Category = "External Definitions",
+            Term = "API",
+            Definition = "Application Programming Interface. A set of rules and specifications that allow software components to communicate and exchange data."
+        };
+
+        yield return new Glossary
+        {
+            Key = 2,
+            Category = "Core Definitions",
+            Term = "Connectors",
+            Definition = "Connectors allow you to integrate with various services provide AI capabilities, including LLM, AudioToText, TextToAudio, Embedding generation, etc."
+        };
+
+        yield return new Glossary
+        {
+            Key = 3,
+            Category = "External Definitions",
+            Term = "RAG",
+            Definition = "Retrieval Augmented Generation - a term that refers to the process of retrieving additional data to provide as context to an LLM to use when generating a response (completion) to a user’s question (prompt)."
+        };
+    }
+}