This repository has been archived by the owner on Aug 25, 2024. It is now read-only.
forked from LangStream/langstream
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add Rag with AWS example (LangStream#617)
- Loading branch information
1 parent
322bde1
commit 07f3aa5
Showing
7 changed files
with
316 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
# RAG with AWS (S3, Bedrock and OpenSearch) | ||
|
||
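This sample application shows how to build a RAG (Retrieval-Augmented Generation) chatbot on AWS: text files are read from an S3 bucket, embedded with Amazon Bedrock, and indexed in OpenSearch, which is then queried to answer questions. | ||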
|
||
## Prerequisites | ||
|
||
- Create an S3 bucket. | ||
- Set up OpenSearch Serverless in your AWS account. | ||
- Set up Bedrock models. | ||
|
||
|
||
## Configure access to the Vector Database | ||
|
||
Export the following environment variables to configure access to Bedrock, OpenSearch and S3: | ||
|
||
``` | ||
export BEDROCK_ACCESS_KEY=... | ||
export BEDROCK_SECRET_KEY=... | ||
export OPENSEARCH_USERNAME=<aws-access-key-id> | ||
export OPENSEARCH_PASSWORD=<aws-secret-access-key> | ||
export OPENSEARCH_HOST=xxxx.<region>.aoss.amazonaws.com | ||
export OPENSEARCH_REGION=<region> | ||
export S3_BUCKET_NAME=.. | ||
export S3_ACCESS_KEY=.. | ||
export S3_SECRET=.. | ||
export S3_ENDPOINT=https://s3.amazonaws.com | ||
``` | ||
|
||
|
||
## Deploy the LangStream application | ||
|
||
``` | ||
./bin/langstream docker run test -app examples/applications/rag-aws -s examples/secrets/secrets.yaml | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
# | ||
# Copyright DataStax, Inc. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# | ||
|
||
topics: | ||
- name: "questions-topic" | ||
creation-mode: create-if-not-exists | ||
- name: "answers-topic" | ||
creation-mode: create-if-not-exists | ||
errors: | ||
on-failure: "skip" | ||
pipeline: | ||
- name: "convert-to-structure" | ||
type: "document-to-json" | ||
input: "questions-topic" | ||
configuration: | ||
text-field: "question" | ||
- name: "compute-embeddings" | ||
type: "compute-ai-embeddings" | ||
configuration: | ||
model: "amazon.titan-embed-text-v1" | ||
embeddings-field: "value.embeddings_vector" | ||
text: "{{ value.question }}" | ||
batch-size: 10 | ||
flush-interval: 500 | ||
- name: "lookup-related-documents" | ||
type: "query-vector-db" | ||
configuration: | ||
datasource: "OpenSearch" | ||
query: | | ||
{ | ||
"index": "rag-index-example-3", | ||
"size": 10, | ||
"query": { | ||
"knn": { | ||
"embeddings": { | ||
"vector": ?, | ||
"k": 10 | ||
} | ||
} | ||
} | ||
} | ||
fields: | ||
- "value.embeddings_vector" | ||
output-field: "value.related_documents" | ||
- name: "re-rank documents with MMR" | ||
type: "re-rank" | ||
configuration: | ||
max: 5 # keep only the top 5 documents, because we have a hard limit on the prompt size | ||
field: "value.related_documents" | ||
query-text: "value.question" | ||
query-embeddings: "value.embeddings_vector" | ||
output-field: "value.related_documents" | ||
text-field: "record.document.content" | ||
embeddings-field: "record.document.embeddings" | ||
algorithm: "MMR" | ||
lambda: 0.5 | ||
k1: 1.2 | ||
b: 0.75 | ||
- name: "ai-chat-completions" | ||
type: "ai-chat-completions" | ||
configuration: | ||
model: "anthropic.claude-v2" | ||
completion-field: "value.answer" | ||
min-chunks-per-message: 5 | ||
log-field: "value.prompt" | ||
options: | ||
request-parameters: | ||
temperature: 0.8 | ||
max_tokens_to_sample: 200 | ||
top_p: 0.9 | ||
top_k: 250 | ||
response-completions-expression: "completion" | ||
messages: | ||
- content: | | ||
Human: | ||
The user is asking questions about books. Please provide an answer to him as response and add an explanation. Do not respond with generic information, only use the information provided in this prompt. If you don't know, just say so. | ||
The user question is: "{{ value.question}}?". | ||
Information you can use are the following: | ||
{{# value.related_documents}} | ||
##### | ||
{{ document.content }} | ||
{{/ value.related_documents}} | ||
Assistant: | ||
- name: "Format response" | ||
type: compute | ||
output: "answers-topic" | ||
configuration: | ||
fields: | ||
- name: "value" | ||
type: STRING | ||
expression: "value.answer" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
# | ||
# | ||
# Copyright DataStax, Inc. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# | ||
|
||
configuration: | ||
resources: | ||
- type: "bedrock-configuration" | ||
name: "Bedrock" | ||
configuration: | ||
access-key: "${secrets.bedrock.access-key}" | ||
secret-key: "${secrets.bedrock.secret-key}" | ||
region: "${secrets.bedrock.region}" | ||
- type: "vector-database" | ||
name: "OpenSearch" | ||
configuration: | ||
service: "opensearch" | ||
username: "${secrets.opensearch.username}" | ||
password: "${secrets.opensearch.password}" | ||
host: "${secrets.opensearch.host}" | ||
port: "${secrets.opensearch.port}" | ||
https: "${secrets.opensearch.https}" | ||
region: "${secrets.opensearch.region}" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
# | ||
# | ||
# Copyright DataStax, Inc. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# | ||
|
||
gateways: | ||
- id: chat | ||
type: chat | ||
chat-options: | ||
answers-topic: answers-topic | ||
questions-topic: questions-topic | ||
headers: | ||
- key: langstream-client-session-id | ||
value-from-parameters: sessionId |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,99 @@ | ||
# | ||
# Copyright DataStax, Inc. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# | ||
|
||
name: "Ingest text files from S3" | ||
assets: | ||
- name: "os-index" | ||
asset-type: "opensearch-index" | ||
creation-mode: create-if-not-exists | ||
config: | ||
index-name: "rag-index-example-3" | ||
datasource: "OpenSearch" | ||
settings: | | ||
{ | ||
"index": { | ||
"knn": true, | ||
"knn.algo_param.ef_search": 100 | ||
} | ||
} | ||
mappings: | | ||
{ | ||
"properties": { | ||
"content": { | ||
"type": "text" | ||
}, | ||
"embeddings": { | ||
"type": "knn_vector", | ||
"dimension": 1536 | ||
} | ||
} | ||
} | ||
pipeline: | ||
- name: "Read from S3" | ||
type: "s3-source" | ||
configuration: | ||
bucketName: "${secrets.s3.bucket-name}" | ||
endpoint: "${secrets.s3.endpoint}" | ||
access-key: "${secrets.s3.access-key}" | ||
secret-key: "${secrets.s3.secret}" | ||
region: "${secrets.s3.region}" | ||
idle-time: 5 | ||
- name: "Extract text" | ||
type: "text-extractor" | ||
- name: "Normalise text" | ||
type: "text-normaliser" | ||
configuration: | ||
make-lowercase: true | ||
trim-spaces: true | ||
- name: "Split into chunks" | ||
type: "text-splitter" | ||
configuration: | ||
splitter_type: "RecursiveCharacterTextSplitter" | ||
chunk_size: 400 | ||
separators: ["\n\n", "\n", " ", ""] | ||
keep_separator: false | ||
chunk_overlap: 100 | ||
length_function: "cl100k_base" | ||
- name: "Convert to structured data" | ||
type: "document-to-json" | ||
configuration: | ||
text-field: text | ||
copy-properties: true | ||
- name: "prepare-structure" | ||
type: "compute" | ||
configuration: | ||
fields: | ||
- name: "value.filename" | ||
expression: "properties.url" | ||
type: STRING | ||
- name: "compute-embeddings" | ||
type: "compute-ai-embeddings" | ||
configuration: | ||
model: "amazon.titan-embed-text-v1" | ||
embeddings-field: "value.embeddings_vector" | ||
text: "{{ value.text }}" | ||
batch-size: 10 | ||
flush-interval: 500 | ||
- name: "Write to vector db" | ||
type: "vector-db-sink" | ||
configuration: | ||
datasource: "OpenSearch" | ||
index-name: "rag-index-example-3" | ||
fields: | ||
- name: "embeddings" | ||
expression: "value.embeddings_vector" | ||
- name: "content" | ||
expression: "value.text" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters