From f3633e2985e8a6bf0a9f2571c552828fec7e91fa Mon Sep 17 00:00:00 2001
From: westey <164392973+westey-m@users.noreply.github.com>
Date: Mon, 4 Nov 2024 13:44:12 +0000
Subject: [PATCH] .Net: Add sample to show how to create a decorator for
generating embeddings. (#9502)
### Motivation and Context
Generating embeddings directly every time can be frustrating and it's
nice to hide this functionality away from the main code
#9483
### Description
Adding a sample that shows how to create a set of decorator classes for
Vector Stores to automatically generate embeddings on upsert and when
using VectorizableTextSearch.
### Contribution Checklist
- [x] The code builds clean without any errors or warnings
- [x] The PR follows the [SK Contribution
Guidelines](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md)
and the [pre-submission formatting
script](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md#development-scripts)
raises no violations
- [x] All unit tests pass, and I have added new tests where possible
- [x] I didn't break anyone :smile:
---
.../GenerateTextEmbeddingAttribute.cs | 39 ++++
.../TextEmbeddingVectorStore.cs | 48 +++++
.../TextEmbeddingVectorStoreExtensions.cs | 40 ++++
...extEmbeddingVectorStoreRecordCollection.cs | 193 ++++++++++++++++++
.../Memory/VectorStore_EmbeddingGeneration.cs | 120 +++++++++++
5 files changed, 440 insertions(+)
create mode 100644 dotnet/samples/Concepts/Memory/VectorStoreEmbeddingGeneration/GenerateTextEmbeddingAttribute.cs
create mode 100644 dotnet/samples/Concepts/Memory/VectorStoreEmbeddingGeneration/TextEmbeddingVectorStore.cs
create mode 100644 dotnet/samples/Concepts/Memory/VectorStoreEmbeddingGeneration/TextEmbeddingVectorStoreExtensions.cs
create mode 100644 dotnet/samples/Concepts/Memory/VectorStoreEmbeddingGeneration/TextEmbeddingVectorStoreRecordCollection.cs
create mode 100644 dotnet/samples/Concepts/Memory/VectorStore_EmbeddingGeneration.cs
diff --git a/dotnet/samples/Concepts/Memory/VectorStoreEmbeddingGeneration/GenerateTextEmbeddingAttribute.cs b/dotnet/samples/Concepts/Memory/VectorStoreEmbeddingGeneration/GenerateTextEmbeddingAttribute.cs
new file mode 100644
index 000000000000..9a8e6b17aa27
--- /dev/null
+++ b/dotnet/samples/Concepts/Memory/VectorStoreEmbeddingGeneration/GenerateTextEmbeddingAttribute.cs
@@ -0,0 +1,39 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+namespace Memory.VectorStoreEmbeddingGeneration;
+
+/// <summary>
+/// An attribute that can be used for an embedding property to indicate that it should
+/// be generated from one or more text properties located on the same class.
+/// </summary>
+/// <remarks>
+/// This class is part of the <see cref="VectorStore_EmbeddingGeneration"/> sample.
+/// </remarks>
+[AttributeUsage(AttributeTargets.Property, AllowMultiple = false, Inherited = true)]
+public sealed class GenerateTextEmbeddingAttribute : Attribute
+{
+    /// <summary>
+    /// Initializes a new instance of the <see cref="GenerateTextEmbeddingAttribute"/> class.
+    /// </summary>
+    /// <param name="sourcePropertyName">The name of the property that the embedding should be generated from. Must not be null.</param>
+#pragma warning disable CA1019 // Define accessors for attribute arguments
+    public GenerateTextEmbeddingAttribute(string sourcePropertyName)
+#pragma warning restore CA1019 // Define accessors for attribute arguments
+    {
+        this.SourcePropertyNames = [sourcePropertyName ?? throw new ArgumentNullException(nameof(sourcePropertyName))];
+    }
+
+    /// <summary>
+    /// Initializes a new instance of the <see cref="GenerateTextEmbeddingAttribute"/> class.
+    /// </summary>
+    /// <param name="sourcePropertyNames">The names of the properties that the embedding should be generated from. Must not be null.</param>
+    public GenerateTextEmbeddingAttribute(string[] sourcePropertyNames)
+    {
+        this.SourcePropertyNames = sourcePropertyNames ?? throw new ArgumentNullException(nameof(sourcePropertyNames));
+    }
+
+    /// <summary>
+    /// Gets the names of the properties to use as the source for generating the embedding.
+    /// </summary>
+    public string[] SourcePropertyNames { get; }
+}
diff --git a/dotnet/samples/Concepts/Memory/VectorStoreEmbeddingGeneration/TextEmbeddingVectorStore.cs b/dotnet/samples/Concepts/Memory/VectorStoreEmbeddingGeneration/TextEmbeddingVectorStore.cs
new file mode 100644
index 000000000000..6848b38af48f
--- /dev/null
+++ b/dotnet/samples/Concepts/Memory/VectorStoreEmbeddingGeneration/TextEmbeddingVectorStore.cs
@@ -0,0 +1,48 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using Microsoft.Extensions.VectorData;
+using Microsoft.SemanticKernel.Embeddings;
+
+namespace Memory.VectorStoreEmbeddingGeneration;
+
+/// <summary>
+/// Decorator for a <see cref="IVectorStore"/> that generates embeddings for records on upsert.
+/// </summary>
+/// <remarks>
+/// This class is part of the <see cref="VectorStore_EmbeddingGeneration"/> sample.
+/// </remarks>
+public class TextEmbeddingVectorStore : IVectorStore
+{
+    /// <summary>The decorated <see cref="IVectorStore"/>.</summary>
+    private readonly IVectorStore _decoratedVectorStore;
+
+    /// <summary>The service to use for generating the embeddings.</summary>
+    private readonly ITextEmbeddingGenerationService _textEmbeddingGenerationService;
+
+    /// <summary>
+    /// Initializes a new instance of the <see cref="TextEmbeddingVectorStore"/> class.
+    /// </summary>
+    /// <param name="decoratedVectorStore">The decorated <see cref="IVectorStore"/>.</param>
+    /// <param name="textEmbeddingGenerationService">The service to use for generating the embeddings.</param>
+    public TextEmbeddingVectorStore(IVectorStore decoratedVectorStore, ITextEmbeddingGenerationService textEmbeddingGenerationService)
+    {
+        // Verify & Assign.
+        this._decoratedVectorStore = decoratedVectorStore ?? throw new ArgumentNullException(nameof(decoratedVectorStore));
+        this._textEmbeddingGenerationService = textEmbeddingGenerationService ?? throw new ArgumentNullException(nameof(textEmbeddingGenerationService));
+    }
+
+    /// <inheritdoc />
+    public IVectorStoreRecordCollection<TKey, TRecord> GetCollection<TKey, TRecord>(string name, VectorStoreRecordDefinition? vectorStoreRecordDefinition = null)
+        where TKey : notnull
+    {
+        var collection = this._decoratedVectorStore.GetCollection<TKey, TRecord>(name, vectorStoreRecordDefinition);
+        var embeddingStore = new TextEmbeddingVectorStoreRecordCollection<TKey, TRecord>(collection, this._textEmbeddingGenerationService); // Wrap the inner collection with the embedding-generating decorator.
+        return embeddingStore;
+    }
+
+    /// <inheritdoc />
+    public IAsyncEnumerable<string> ListCollectionNamesAsync(CancellationToken cancellationToken = default)
+    {
+        return this._decoratedVectorStore.ListCollectionNamesAsync(cancellationToken);
+    }
+}
diff --git a/dotnet/samples/Concepts/Memory/VectorStoreEmbeddingGeneration/TextEmbeddingVectorStoreExtensions.cs b/dotnet/samples/Concepts/Memory/VectorStoreEmbeddingGeneration/TextEmbeddingVectorStoreExtensions.cs
new file mode 100644
index 000000000000..e1b6c779fdb8
--- /dev/null
+++ b/dotnet/samples/Concepts/Memory/VectorStoreEmbeddingGeneration/TextEmbeddingVectorStoreExtensions.cs
@@ -0,0 +1,40 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using Microsoft.Extensions.VectorData;
+using Microsoft.SemanticKernel.Embeddings;
+
+namespace Memory.VectorStoreEmbeddingGeneration;
+
+/// <summary>
+/// Contains extension methods to help add text embedding generation to a <see cref="IVectorStore"/> or <see cref="IVectorStoreRecordCollection{TKey, TRecord}"/>.
+/// </summary>
+/// <remarks>
+/// This class is part of the <see cref="VectorStore_EmbeddingGeneration"/> sample.
+/// </remarks>
+public static class TextEmbeddingVectorStoreExtensions
+{
+    /// <summary>
+    /// Add text embedding generation to a <see cref="IVectorStore"/>.
+    /// </summary>
+    /// <param name="vectorStore">The <see cref="IVectorStore"/> to add text embedding generation to.</param>
+    /// <param name="textEmbeddingGenerationService">The service to use for generating text embeddings.</param>
+    /// <returns>The <see cref="IVectorStore"/> with text embedding added.</returns>
+    public static IVectorStore UseTextEmbeddingGeneration(this IVectorStore vectorStore, ITextEmbeddingGenerationService textEmbeddingGenerationService)
+    {
+        return new TextEmbeddingVectorStore(vectorStore, textEmbeddingGenerationService);
+    }
+
+    /// <summary>
+    /// Add text embedding generation to a <see cref="IVectorStoreRecordCollection{TKey, TRecord}"/>.
+    /// </summary>
+    /// <param name="vectorStoreRecordCollection">The <see cref="IVectorStoreRecordCollection{TKey, TRecord}"/> to add text embedding generation to.</param>
+    /// <param name="textEmbeddingGenerationService">The service to use for generating text embeddings.</param>
+    /// <typeparam name="TKey">The data type of the record key.</typeparam>
+    /// <typeparam name="TRecord">The record data model to use for adding, updating and retrieving data from the store.</typeparam>
+    /// <returns>The <see cref="IVectorStoreRecordCollection{TKey, TRecord}"/> with text embedding added.</returns>
+    public static IVectorStoreRecordCollection<TKey, TRecord> UseTextEmbeddingGeneration<TKey, TRecord>(this IVectorStoreRecordCollection<TKey, TRecord> vectorStoreRecordCollection, ITextEmbeddingGenerationService textEmbeddingGenerationService)
+        where TKey : notnull
+    {
+        return new TextEmbeddingVectorStoreRecordCollection<TKey, TRecord>(vectorStoreRecordCollection, textEmbeddingGenerationService);
+    }
+}
diff --git a/dotnet/samples/Concepts/Memory/VectorStoreEmbeddingGeneration/TextEmbeddingVectorStoreRecordCollection.cs b/dotnet/samples/Concepts/Memory/VectorStoreEmbeddingGeneration/TextEmbeddingVectorStoreRecordCollection.cs
new file mode 100644
index 000000000000..a3c5517653af
--- /dev/null
+++ b/dotnet/samples/Concepts/Memory/VectorStoreEmbeddingGeneration/TextEmbeddingVectorStoreRecordCollection.cs
@@ -0,0 +1,193 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using System.Reflection;
+using System.Runtime.CompilerServices;
+using Microsoft.Extensions.VectorData;
+using Microsoft.SemanticKernel.Embeddings;
+
+namespace Memory.VectorStoreEmbeddingGeneration;
+
+/// <summary>
+/// Decorator for a <see cref="IVectorStoreRecordCollection{TKey, TRecord}"/> that generates embeddings for records on upsert and when using <see cref="IVectorizableTextSearch{TRecord}"/>.
+/// </summary>
+/// <remarks>
+/// This class is part of the <see cref="VectorStore_EmbeddingGeneration"/> sample.
+/// </remarks>
+/// <typeparam name="TKey">The data type of the record key.</typeparam>
+/// <typeparam name="TRecord">The record data model to use for adding, updating and retrieving data from the store.</typeparam>
+#pragma warning disable CA1711 // Identifiers should not have incorrect suffix
+public class TextEmbeddingVectorStoreRecordCollection<TKey, TRecord> : IVectorStoreRecordCollection<TKey, TRecord>, IVectorizableTextSearch<TRecord>
+#pragma warning restore CA1711 // Identifiers should not have incorrect suffix
+    where TKey : notnull
+{
+    /// <summary>The decorated <see cref="IVectorStoreRecordCollection{TKey, TRecord}"/>.</summary>
+    private readonly IVectorStoreRecordCollection<TKey, TRecord> _decoratedVectorStoreRecordCollection;
+
+    /// <summary>The service to use for generating the embeddings.</summary>
+    private readonly ITextEmbeddingGenerationService _textEmbeddingGenerationService;
+
+    /// <summary>The embedding properties of <typeparamref name="TRecord"/>, each paired with the string properties its embedding is generated from.</summary>
+    private readonly IEnumerable<(PropertyInfo EmbeddingPropertyInfo, IList<PropertyInfo> SourcePropertiesInfo)> _embeddingPropertiesWithSourceProperties;
+
+    /// <summary>
+    /// Initializes a new instance of the <see cref="TextEmbeddingVectorStoreRecordCollection{TKey, TRecord}"/> class.
+    /// </summary>
+    /// <param name="decoratedVectorStoreRecordCollection">The decorated <see cref="IVectorStoreRecordCollection{TKey, TRecord}"/>.</param>
+    /// <param name="textEmbeddingGenerationService">The service to use for generating the embeddings.</param>
+    /// <exception cref="ArgumentException">Thrown when embedding properties are referencing data source properties that do not exist or are not of type string.</exception>
+    /// <exception cref="ArgumentNullException">Thrown when required parameters are null.</exception>
+    public TextEmbeddingVectorStoreRecordCollection(IVectorStoreRecordCollection<TKey, TRecord> decoratedVectorStoreRecordCollection, ITextEmbeddingGenerationService textEmbeddingGenerationService)
+    {
+        // Assign.
+        this._decoratedVectorStoreRecordCollection = decoratedVectorStoreRecordCollection ?? throw new ArgumentNullException(nameof(decoratedVectorStoreRecordCollection));
+        this._textEmbeddingGenerationService = textEmbeddingGenerationService ?? throw new ArgumentNullException(nameof(textEmbeddingGenerationService));
+
+        // Find all the embedding properties to generate embeddings for. ToList() forces the lazy iterator so that validation errors surface here and reflection runs only once.
+        this._embeddingPropertiesWithSourceProperties = FindDataPropertiesWithEmbeddingProperties(typeof(TRecord)).ToList();
+    }
+
+    /// <inheritdoc />
+    public string CollectionName => this._decoratedVectorStoreRecordCollection.CollectionName;
+
+    /// <inheritdoc />
+    public Task<bool> CollectionExistsAsync(CancellationToken cancellationToken = default)
+    {
+        return this._decoratedVectorStoreRecordCollection.CollectionExistsAsync(cancellationToken);
+    }
+
+    /// <inheritdoc />
+    public Task CreateCollectionAsync(CancellationToken cancellationToken = default)
+    {
+        return this._decoratedVectorStoreRecordCollection.CreateCollectionAsync(cancellationToken);
+    }
+
+    /// <inheritdoc />
+    public async Task CreateCollectionIfNotExistsAsync(CancellationToken cancellationToken = default)
+    {
+        if (!await this.CollectionExistsAsync(cancellationToken).ConfigureAwait(false))
+        {
+            await this.CreateCollectionAsync(cancellationToken).ConfigureAwait(false);
+        }
+    }
+
+    /// <inheritdoc />
+    public Task DeleteCollectionAsync(CancellationToken cancellationToken = default)
+    {
+        return this._decoratedVectorStoreRecordCollection.DeleteCollectionAsync(cancellationToken);
+    }
+
+    /// <inheritdoc />
+    public Task DeleteAsync(TKey key, DeleteRecordOptions? options = null, CancellationToken cancellationToken = default)
+    {
+        return this._decoratedVectorStoreRecordCollection.DeleteAsync(key, options, cancellationToken);
+    }
+
+    /// <inheritdoc />
+    public Task DeleteBatchAsync(IEnumerable<TKey> keys, DeleteRecordOptions? options = null, CancellationToken cancellationToken = default)
+    {
+        return this._decoratedVectorStoreRecordCollection.DeleteBatchAsync(keys, options, cancellationToken);
+    }
+
+    /// <inheritdoc />
+    public Task<TRecord?> GetAsync(TKey key, GetRecordOptions? options = null, CancellationToken cancellationToken = default)
+    {
+        return this._decoratedVectorStoreRecordCollection.GetAsync(key, options, cancellationToken);
+    }
+
+    /// <inheritdoc />
+    public IAsyncEnumerable<TRecord> GetBatchAsync(IEnumerable<TKey> keys, GetRecordOptions? options = null, CancellationToken cancellationToken = default)
+    {
+        return this._decoratedVectorStoreRecordCollection.GetBatchAsync(keys, options, cancellationToken);
+    }
+
+    /// <inheritdoc />
+    public async Task<TKey> UpsertAsync(TRecord record, UpsertRecordOptions? options = null, CancellationToken cancellationToken = default)
+    {
+        var recordWithEmbeddings = await this.AddEmbeddingsAsync(record, cancellationToken).ConfigureAwait(false);
+        return await this._decoratedVectorStoreRecordCollection.UpsertAsync(recordWithEmbeddings, options, cancellationToken).ConfigureAwait(false);
+    }
+
+    /// <inheritdoc />
+    public async IAsyncEnumerable<TKey> UpsertBatchAsync(IEnumerable<TRecord> records, UpsertRecordOptions? options = null, [EnumeratorCancellation] CancellationToken cancellationToken = default)
+    {
+        var recordWithEmbeddingsTasks = records.Select(r => this.AddEmbeddingsAsync(r, cancellationToken));
+        var recordWithEmbeddings = await Task.WhenAll(recordWithEmbeddingsTasks).ConfigureAwait(false);
+        var upsertResults = this._decoratedVectorStoreRecordCollection.UpsertBatchAsync(recordWithEmbeddings, options, cancellationToken);
+        await foreach (var upsertResult in upsertResults.ConfigureAwait(false))
+        {
+            yield return upsertResult;
+        }
+    }
+
+    /// <inheritdoc />
+    public Task<VectorSearchResults<TRecord>> VectorizedSearchAsync<TVector>(TVector vector, VectorSearchOptions? options = null, CancellationToken cancellationToken = default)
+    {
+        return this._decoratedVectorStoreRecordCollection.VectorizedSearchAsync(vector, options, cancellationToken);
+    }
+
+    /// <inheritdoc />
+    public async Task<VectorSearchResults<TRecord>> VectorizableTextSearchAsync(string searchText, VectorSearchOptions? options = null, CancellationToken cancellationToken = default)
+    {
+        var embeddingValue = await this._textEmbeddingGenerationService.GenerateEmbeddingAsync(searchText, cancellationToken: cancellationToken).ConfigureAwait(false);
+        return await this.VectorizedSearchAsync(embeddingValue, options, cancellationToken).ConfigureAwait(false);
+    }
+
+    /// <summary>
+    /// Generate and add embeddings for each embedding field that has a <see cref="GenerateTextEmbeddingAttribute"/> on the provided record.
+    /// </summary>
+    /// <param name="record">The record to generate embeddings for. Its embedding properties are set in place.</param>
+    /// <param name="cancellationToken">The <see cref="CancellationToken"/> to monitor for cancellation requests.</param>
+    /// <returns>The same record instance with embeddings added.</returns>
+    private async Task<TRecord> AddEmbeddingsAsync(TRecord record, CancellationToken cancellationToken)
+    {
+        foreach (var (embeddingPropertyInfo, sourcePropertiesInfo) in this._embeddingPropertiesWithSourceProperties)
+        {
+            var sourceValues = sourcePropertiesInfo.Select(x => x.GetValue(record)).Cast<string>().Where(x => !string.IsNullOrWhiteSpace(x)); // Skip null/blank source values.
+            var sourceString = string.Join("\n", sourceValues);
+
+            var embeddingValue = await this._textEmbeddingGenerationService.GenerateEmbeddingAsync(sourceString, cancellationToken: cancellationToken).ConfigureAwait(false);
+            embeddingPropertyInfo.SetValue(record, embeddingValue);
+        }
+
+        return record;
+    }
+
+    /// <summary>
+    /// Get the list of properties with <see cref="GenerateTextEmbeddingAttribute"/> from the data model.
+    /// </summary>
+    /// <param name="dataModelType">The type of the data model to find the embedding properties on.</param>
+    /// <returns>The list of properties with <see cref="GenerateTextEmbeddingAttribute"/> with the properties from which the embedding can be generated. Evaluated lazily; validation errors are thrown during enumeration.</returns>
+    private static IEnumerable<(PropertyInfo EmbeddingPropertyInfo, IList<PropertyInfo> SourcePropertiesInfo)> FindDataPropertiesWithEmbeddingProperties(Type dataModelType)
+    {
+        var allProperties = dataModelType.GetProperties();
+        var propertiesDictionary = allProperties.ToDictionary(p => p.Name);
+
+        // Loop through all the properties to find the ones that have the GenerateTextEmbeddingAttribute.
+        foreach (var property in allProperties)
+        {
+            var attribute = property.GetCustomAttribute<GenerateTextEmbeddingAttribute>();
+            if (attribute is not null)
+            {
+                // Find the source properties that the embedding should be generated from.
+                var sourcePropertiesInfo = new List<PropertyInfo>();
+                foreach (var sourcePropertyName in attribute.SourcePropertyNames)
+                {
+                    if (!propertiesDictionary.TryGetValue(sourcePropertyName, out var sourcePropertyInfo))
+                    {
+                        throw new ArgumentException($"The source property '{sourcePropertyName}' as referenced by embedding property '{property.Name}' does not exist in the record model.");
+                    }
+                    else if (sourcePropertyInfo.PropertyType != typeof(string))
+                    {
+                        throw new ArgumentException($"The source property '{sourcePropertyName}' as referenced by embedding property '{property.Name}' has type {sourcePropertyInfo.PropertyType} but must be a string.");
+                    }
+                    else
+                    {
+                        sourcePropertiesInfo.Add(sourcePropertyInfo);
+                    }
+                }
+
+                yield return (property, sourcePropertiesInfo);
+            }
+        }
+    }
+}
diff --git a/dotnet/samples/Concepts/Memory/VectorStore_EmbeddingGeneration.cs b/dotnet/samples/Concepts/Memory/VectorStore_EmbeddingGeneration.cs
new file mode 100644
index 000000000000..b641443e878a
--- /dev/null
+++ b/dotnet/samples/Concepts/Memory/VectorStore_EmbeddingGeneration.cs
@@ -0,0 +1,120 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using Azure.Identity;
+using Memory.VectorStoreEmbeddingGeneration;
+using Microsoft.Extensions.VectorData;
+using Microsoft.SemanticKernel.Connectors.AzureOpenAI;
+using Microsoft.SemanticKernel.Connectors.InMemory;
+
+namespace Memory;
+
+/// <summary>
+/// This sample shows how to abstract embedding generation away from usage by
+/// using the decorator pattern.
+///
+/// In the sample we create an <see cref="InMemoryVectorStore"/> and then using
+/// the <c>UseTextEmbeddingGeneration</c> extension method from <see cref="TextEmbeddingVectorStoreExtensions"/>
+/// we wrap the <see cref="InMemoryVectorStore"/> with a <see cref="TextEmbeddingVectorStore"/> that will automatically generate embeddings for properties
+/// that have the <see cref="GenerateTextEmbeddingAttribute"/> attribute.
+///
+/// The decorated vector store also adds the additional <see cref="IVectorizableTextSearch{TRecord}"/> interface to the collection
+/// which allows us to search the collection using a text string without having to manually generate the embeddings.
+///
+/// Note that the decorator classes demonstrated here are part of this sample and not part of the Semantic Kernel libraries.
+/// To use them, you will need to copy them to your own project.
+/// </summary>
+public class VectorStore_EmbeddingGeneration(ITestOutputHelper output) : BaseTest(output)
+{
+    [Fact]
+    public async Task UseEmbeddingGenerationViaDecoratorAsync()
+    {
+        // Create an embedding generation service.
+        var textEmbeddingGenerationService = new AzureOpenAITextEmbeddingGenerationService(
+            TestConfiguration.AzureOpenAIEmbeddings.DeploymentName,
+            TestConfiguration.AzureOpenAIEmbeddings.Endpoint,
+            new AzureCliCredential());
+
+        // Construct an InMemory vector store with embedding generation.
+        // The UseTextEmbeddingGeneration method adds an embedding generation
+        // decorator class to the vector store that will automatically generate
+        // embeddings for properties that are decorated with the GenerateTextEmbeddingAttribute.
+        var vectorStore = new InMemoryVectorStore().UseTextEmbeddingGeneration(textEmbeddingGenerationService);
+
+        // Get and create collection if it doesn't exist.
+        var collection = vectorStore.GetCollection<ulong, Glossary>("skglossary");
+        await collection.CreateCollectionIfNotExistsAsync();
+
+        // Create and upsert glossary entries into the collection.
+        await collection.UpsertBatchAsync(CreateGlossaryEntries()).ToListAsync();
+
+        // Search the collection using a vectorizable text search.
+        var search = collection as IVectorizableTextSearch<Glossary>;
+        var searchString = "What is an Application Programming Interface";
+        var searchResult = await search!.VectorizableTextSearchAsync(searchString, new() { Top = 1 });
+        var resultRecords = await searchResult.Results.ToListAsync();
+
+        Console.WriteLine("Search string: " + searchString);
+        Console.WriteLine("Result: " + resultRecords.First().Record.Definition);
+        Console.WriteLine();
+    }
+
+    /// <summary>
+    /// Sample model class that represents a glossary entry.
+    /// </summary>
+    /// <remarks>
+    /// Note that each property is decorated with an attribute that specifies how the property should be treated by the vector store.
+    /// This allows us to create a collection in the vector store and upsert and retrieve instances of this class without any further configuration.
+    ///
+    /// The <see cref="DefinitionEmbedding"/> property is also decorated with the <see cref="GenerateTextEmbeddingAttribute"/> attribute which
+    /// allows the vector store to automatically generate an embedding for the property when the record is upserted.
+    /// </remarks>
+    private sealed class Glossary
+    {
+        [VectorStoreRecordKey]
+        public ulong Key { get; set; }
+
+        [VectorStoreRecordData(IsFilterable = true)]
+        public string Category { get; set; }
+
+        [VectorStoreRecordData]
+        public string Term { get; set; }
+
+        [VectorStoreRecordData]
+        public string Definition { get; set; }
+
+        [GenerateTextEmbedding(nameof(Definition))]
+        [VectorStoreRecordVector(1536)]
+        public ReadOnlyMemory<float> DefinitionEmbedding { get; set; }
+    }
+
+    /// <summary>
+    /// Create some sample glossary entries.
+    /// </summary>
+    /// <returns>A list of sample glossary entries.</returns>
+    private static IEnumerable<Glossary> CreateGlossaryEntries()
+    {
+        yield return new Glossary
+        {
+            Key = 1,
+            Category = "External Definitions",
+            Term = "API",
+            Definition = "Application Programming Interface. A set of rules and specifications that allow software components to communicate and exchange data."
+        };
+
+        yield return new Glossary
+        {
+            Key = 2,
+            Category = "Core Definitions",
+            Term = "Connectors",
+            Definition = "Connectors allow you to integrate with various services provide AI capabilities, including LLM, AudioToText, TextToAudio, Embedding generation, etc."
+        };
+
+        yield return new Glossary
+        {
+            Key = 3,
+            Category = "External Definitions",
+            Term = "RAG",
+            Definition = "Retrieval Augmented Generation - a term that refers to the process of retrieving additional data to provide as context to an LLM to use when generating a response (completion) to a user’s question (prompt)."
+        };
+    }
+}