feat: Support image recognition for orchestration (#435)

* chore: add sample code and e2e test for image recognition * chore: update orchestration README for image * feat: export image content types * chore: update documentation * fix: Changes from lint * chore: update doc * chore: add changeset --------- Co-authored-by: cloud-sdk-js <cloud-sdk-js@github.com>
SAP · Jan 10, 2025 · 1da2caa · 1da2caa
1 parent b7f7f75
commit 1da2caa
Showing 8 changed files with 118 additions and 13 deletions.
diff --git a/.changeset/fair-clocks-join.md b/.changeset/fair-clocks-join.md
@@ -0,0 +1,5 @@
+---
+'@sap-ai-sdk/orchestration': minor
+---
+
+[New Functionality] Support image recognition for orchestration service.
diff --git a/packages/orchestration/README.md b/packages/orchestration/README.md
@@ -73,7 +73,7 @@ import { OrchestrationClient } from '@sap-ai-sdk/orchestration';
 
 const orchestrationClient = new OrchestrationClient({
   llm: {
-    model_name: 'gpt-4-32k',
+    model_name: 'gpt-4o',
     model_params: { max_tokens: 50, temperature: 0.1 },
     model_version: 'latest'
   },
@@ -96,7 +96,7 @@ import { OrchestrationClient } from '@sap-ai-sdk/orchestration';
 
 const orchestrationClient = new OrchestrationClient({
   llm: {
-    model_name: 'gpt-4-32k',
+    model_name: 'gpt-4o',
     model_params: { max_tokens: 50, temperature: 0.1 }
   },
   templating: {
@@ -165,7 +165,7 @@ import { OrchestrationClient } from '@sap-ai-sdk/orchestration';
 
 const orchestrationClient = new OrchestrationClient({
   llm: {
-    model_name: 'gpt-4-32k',
+    model_name: 'gpt-4o',
     model_params: { max_tokens: 50, temperature: 0.1 }
   },
   templating: {
@@ -194,6 +194,50 @@ const response = await orchestrationClient.chatCompletion({
 const responseContent = response.getContent();
 ```
 
+#### Image Recognition
+
+Many models in the orchestration service have image recognition capabilities, meaning the models can take images and answer questions about them.
+
+```ts
+import { OrchestrationClient } from '@sap-ai-sdk/orchestration';
+
+const orchestrationClient = new OrchestrationClient({
+  llm: {
+    model_name: 'gpt-4o',
+    model_params: {}
+  },
+  templating: {
+    template: [
+      {
+        role: 'user',
+        content: [
+          {
+            type: 'text',
+            text: 'What is the content of the image?'
+          },
+          {
+            type: 'image_url',
+            image_url: {
+              url: '{{?imageUrl}}'
+            }
+          }
+        ]
+      }
+    ]
+  }
+});
+
+const response = await orchestrationClient.chatCompletion({
+  inputParams: {
+    imageUrl: 'IMAGE_URL'
+  }
+});
+```
+
+`IMAGE_URL` can either be a public URL or a base64-encoded image, e.g., `data:image/jpeg;base64,...`.
+The model can take multiple images.
+It will process each image and use the information from all of them to answer the question.
+
 ### Content Filtering
 
 Use the orchestration client with filtering to restrict content that is passed to and received from a generative AI model.
@@ -209,7 +253,7 @@ import {
 const filter = buildAzureContentFilter({ Hate: 2, Violence: 4 });
 const orchestrationClient = new OrchestrationClient({
   llm: {
-    model_name: 'gpt-4-32k',
+    model_name: 'gpt-4o',
     model_params: { max_tokens: 50, temperature: 0.1 }
   },
   templating: {
@@ -255,7 +299,7 @@ You can anonymize or pseudonomize the prompt using the data masking capabilities
 ```ts
 const orchestrationClient = new OrchestrationClient({
   llm: {
-    model_name: 'gpt-4-32k',
+    model_name: 'gpt-4o',
     model_params: {}
   },
   templating: {
@@ -352,7 +396,7 @@ The resource group can be used as an additional parameter to pick the right orch
 const orchestrationClient = new OrchestrationClient(
   {
     llm: {
-      model_name: 'gpt-4-32k',
+      model_name: 'gpt-4o',
       model_params: { max_tokens: 50, temperature: 0.1 }
     },
     templating: {

diff --git a/packages/orchestration/src/index.ts b/packages/orchestration/src/index.ts
@@ -31,7 +31,11 @@ export type {
   ChatMessage,
   AzureThreshold,
   AzureContentSafety,
-  AzureContentSafetyFilterConfig
+  AzureContentSafetyFilterConfig,
+  ImageContent,
+  TextContent,
+  MultiChatMessageContent,
+  MultiChatMessage
 } from './client/api/schema/index.js';
 
 export type {

diff --git a/sample-code/README.md b/sample-code/README.md
@@ -131,10 +131,16 @@ Get chat completion response for a given static input.
 
 #### Templating
 
-`GET orchestration/template`
+`GET /orchestration/template`
 
 Get chat completion response with template and input parameters.
-Define variable by wrapping it with `{{? ... }}`.
+Define a variable by wrapping it with `{{?...}}`.
+
+#### Image Recognition
+
+`GET /orchestration/image`
+
+Get chat completion response with image input.
 
 #### Input Filtering
 

diff --git a/sample-code/src/index.ts b/sample-code/src/index.ts
@@ -13,7 +13,8 @@ export {
   orchestrationRequestConfig,
   orchestrationCompletionMasking,
   orchestrationFromJSON,
-  orchestrationGrounding
+  orchestrationGrounding,
+  orchestrationChatCompletionImage
 } from './orchestration.js';
 export {
   invoke,

diff --git a/sample-code/src/orchestration.ts b/sample-code/src/orchestration.ts
@@ -268,3 +268,39 @@ export async function orchestrationGrounding(): Promise<OrchestrationResponse> {
     }
   });
 }
+
+/**
+ * Send a user message made of a text instruction and an image reference to the orchestration service.
+ * The image location is supplied through the `imageUrl` template placeholder.
+ * @returns The orchestration service response.
+ */
+export async function orchestrationChatCompletionImage(): Promise<OrchestrationResponse> {
+  // Value substituted for the `{{?imageUrl}}` placeholder in the template below.
+  const imageUrl =
+    'https://upload.wikimedia.org/wikipedia/commons/thumb/5/59/SAP_2011_logo.svg/440px-SAP_2011_logo.svg.png';
+
+  const client = new OrchestrationClient({
+    llm,
+    templating: {
+      template: [
+        {
+          role: 'user',
+          // One text part carrying the instruction, one image part carrying the placeholder.
+          content: [
+            { type: 'text', text: 'Describe the image.' },
+            { type: 'image_url', image_url: { url: '{{?imageUrl}}' } }
+          ]
+        }
+      ]
+    }
+  });
+
+  return client.chatCompletion({ inputParams: { imageUrl } });
+}
diff --git a/sample-code/src/server.ts b/sample-code/src/server.ts
@@ -14,7 +14,8 @@ import {
   orchestrationOutputFiltering,
   orchestrationRequestConfig,
   orchestrationFromJSON,
-  orchestrationGrounding
+  orchestrationGrounding,
+  orchestrationChatCompletionImage
 } from './orchestration.js';
 import {
   getDeployments,
@@ -251,7 +252,8 @@ app.get('/orchestration/:sampleCase', async (req, res) => {
       inputFiltering: orchestrationInputFiltering,
       outputFiltering: orchestrationOutputFiltering,
       requestConfig: orchestrationRequestConfig,
-      fromJSON: orchestrationFromJSON
+      fromJSON: orchestrationFromJSON,
+      image: orchestrationChatCompletionImage
     }[sampleCase] || orchestrationChatCompletion;
 
   try {

diff --git a/tests/e2e-tests/src/orchestration.test.ts b/tests/e2e-tests/src/orchestration.test.ts
@@ -4,7 +4,8 @@ import {
   orchestrationInputFiltering,
   orchestrationOutputFiltering,
   orchestrationRequestConfig,
-  orchestrationCompletionMasking
+  orchestrationCompletionMasking,
+  orchestrationChatCompletionImage
 } from '@sap-ai-sdk/sample-code';
 import { loadEnv } from './utils/load-env.js';
 import type { OrchestrationResponse } from '@sap-ai-sdk/orchestration';
@@ -55,4 +56,10 @@ describe('orchestration', () => {
     const result = await orchestrationCompletionMasking();
     expect(result).toEqual(expect.any(String));
   });
+
+  // Verifies that the image sample returns a description mentioning the pictured SAP logo.
+  it('should complete a chat with image', async () => {
+    const response = await orchestrationChatCompletionImage();
+    // Read the content once; `toContain` prints the received text when the assertion fails,
+    // unlike `includes(...)` + `toBe(true)`, which only reports a boolean mismatch.
+    const content = response.getContent();
+    expect(content).toContain('SAP');
+    expect(content).toContain('logo');
+  });
 });