From 0032969b2a0314abc3d0604879b72638a47f8492 Mon Sep 17 00:00:00 2001 From: Francisco Javier Arceo Date: Thu, 2 May 2024 08:04:17 -0400 Subject: [PATCH 1/8] updated PGVector docs Signed-off-by: Francisco Javier Arceo --- docs/reference/online-stores/postgres.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/docs/reference/online-stores/postgres.md b/docs/reference/online-stores/postgres.md index 34d4de3488..77a9408d2b 100644 --- a/docs/reference/online-stores/postgres.md +++ b/docs/reference/online-stores/postgres.md @@ -65,10 +65,16 @@ To compare this set of functionality against other online stores, please see the ## PGVector The Postgres online store supports the use of [PGVector](https://github.com/pgvector/pgvector) for storing feature values. -To enable PGVector, set `pgvector_enabled: true` in the online store configuration. +To enable PGVector, set `pgvector_enabled: true` in the online store configuration. + The `vector_len` parameter can be used to specify the length of the vector. The default value is 512. -Then you can use `retrieve_online_documents` to retrieve the top k closest vectors to a query vector. +Please make sure to follow the instructions in the repository, which, as of the time of this writing, require you to +run `CREATE EXTENSION vector;` in the database. + + +Then you can use `retrieve_online_documents` to retrieve the top k closest vectors to a query vector. +For the Retrieval Augmented Generation (RAG) use-case, you have to embed the query prior to passing the query vector. 
{% code title="python" %} ```python From e242a93282654346a986b61b423048292f531eec Mon Sep 17 00:00:00 2001 From: Francisco Javier Arceo Date: Thu, 2 May 2024 11:57:55 -0400 Subject: [PATCH 2/8] adding distance metric to arguments and defaulting to L2 Signed-off-by: Francisco Javier Arceo --- sdk/python/feast/feature_store.py | 7 +++++++ .../infra/online_stores/contrib/postgres.py | 20 ++++++++++++++++++- .../feast/infra/passthrough_provider.py | 3 ++- sdk/python/feast/infra/provider.py | 1 + sdk/python/tests/foo_provider.py | 1 + 5 files changed, 30 insertions(+), 2 deletions(-) diff --git a/sdk/python/feast/feature_store.py b/sdk/python/feast/feature_store.py index bc492e4208..536b0514e1 100644 --- a/sdk/python/feast/feature_store.py +++ b/sdk/python/feast/feature_store.py @@ -1740,6 +1740,7 @@ def retrieve_online_documents( feature: str, query: Union[str, List[float]], top_k: int, + distance_metric: str, ) -> OnlineResponse: """ Retrieves the top k closest document features. Note, embeddings are a subset of features. @@ -1750,11 +1751,13 @@ def retrieve_online_documents( references must have format "feature_view:feature", e.g, "document_fv:document_embeddings". query: The query to retrieve the closest document features for. top_k: The number of closest document features to retrieve. + distance_metric: The distance metric to use for retrieval. 
""" return self._retrieve_online_documents( feature=feature, query=query, top_k=top_k, + distance_metric=distinct_metric, ) def _retrieve_online_documents( @@ -1762,6 +1765,7 @@ def _retrieve_online_documents( feature: str, query: Union[str, List[float]], top_k: int, + distance_metric: str="L2", ): if isinstance(query, str): raise ValueError( @@ -1783,6 +1787,7 @@ def _retrieve_online_documents( requested_feature, query, top_k, + distance_metric, ) # TODO Refactor to better way of populating result @@ -2025,6 +2030,7 @@ def _retrieve_from_online_store( requested_feature: str, query: List[float], top_k: int, + distance_metric: str, ) -> List[Tuple[Timestamp, "FieldStatus.ValueType", Value, Value, Value]]: """ Search and return document features from the online document store. @@ -2035,6 +2041,7 @@ def _retrieve_from_online_store( requested_feature=requested_feature, query=query, top_k=top_k, + distance_metric=distance_metric, ) read_row_protos = [] diff --git a/sdk/python/feast/infra/online_stores/contrib/postgres.py b/sdk/python/feast/infra/online_stores/contrib/postgres.py index 6ed0885d13..37673df69d 100644 --- a/sdk/python/feast/infra/online_stores/contrib/postgres.py +++ b/sdk/python/feast/infra/online_stores/contrib/postgres.py @@ -22,6 +22,15 @@ from feast.usage import log_exceptions_and_usage + +SUPPORTED_DISTANCE_METRICS_DICT = { + "L2": "<->", + "inner_product": "<#>", + "cosine": "<=>", + "L1": "<+>", +} + + class PostgreSQLOnlineStoreConfig(PostgreSQLConfig): type: Literal["postgres"] = "postgres" @@ -276,6 +285,7 @@ def retrieve_online_documents( requested_feature: str, embedding: List[float], top_k: int, + distance_metric: str = "L2", ) -> List[ Tuple[ Optional[datetime], @@ -292,6 +302,7 @@ def retrieve_online_documents( requested_feature: The requested feature as the column to search embedding: The query embedding to search for top_k: The number of items to return + distance_metric: The distance metric to use for the search.G Returns: List of tuples 
containing the event timestamp and the document feature @@ -303,6 +314,12 @@ def retrieve_online_documents( "pgvector is not enabled in the online store configuration" ) + if distance_metric not in SUPPORTED_DISTANCE_METRICS_DICT: + raise ValueError( + f"Distance metric {distance_metric} is not supported. Supported distance metrics are {SUPPORTED_DISTANCE_METRICS_DICT.keys()}" + ) + + distance_metric_sql = SUPPORTED_DISTANCE_METRICS_DICT[distance_metric] # Convert the embedding to a string to be used in postgres vector search query_embedding_str = f"[{','.join(str(el) for el in embedding)}]" @@ -327,13 +344,14 @@ def retrieve_online_documents( feature_name, value, vector_value, - vector_value <-> %s as distance, + vector_value {distance_metric_sql} %s as distance, event_ts FROM {table_name} WHERE feature_name = {feature_name} ORDER BY distance LIMIT {top_k}; """ ).format( + distance_metric_sql=distance_metric_sql, table_name=sql.Identifier(table_name), feature_name=sql.Literal(requested_feature), top_k=sql.Literal(top_k), diff --git a/sdk/python/feast/infra/passthrough_provider.py b/sdk/python/feast/infra/passthrough_provider.py index 6476acbcb9..1732c98667 100644 --- a/sdk/python/feast/infra/passthrough_provider.py +++ b/sdk/python/feast/infra/passthrough_provider.py @@ -196,12 +196,13 @@ def retrieve_online_documents( requested_feature: str, query: List[float], top_k: int, + distance_metric: str, ) -> List: set_usage_attribute("provider", self.__class__.__name__) result = [] if self.online_store: result = self.online_store.retrieve_online_documents( - config, table, requested_feature, query, top_k + config, table, requested_feature, query, top_k, distance_metric, ) return result diff --git a/sdk/python/feast/infra/provider.py b/sdk/python/feast/infra/provider.py index a45051a1b6..ba25f0bb4d 100644 --- a/sdk/python/feast/infra/provider.py +++ b/sdk/python/feast/infra/provider.py @@ -303,6 +303,7 @@ def retrieve_online_documents( requested_feature: str, query: 
List[float], top_k: int, + distance_metric: str="euclidean", ) -> List[ Tuple[ Optional[datetime], diff --git a/sdk/python/tests/foo_provider.py b/sdk/python/tests/foo_provider.py index 2a830d424c..f869d82e11 100644 --- a/sdk/python/tests/foo_provider.py +++ b/sdk/python/tests/foo_provider.py @@ -111,6 +111,7 @@ def retrieve_online_documents( requested_feature: str, query: List[float], top_k: int, + distance_metric: str, ) -> List[ Tuple[ Optional[datetime], From 6dd845dfbabe21b053e19b75cb0c839aac597d84 Mon Sep 17 00:00:00 2001 From: Francisco Javier Arceo Date: Thu, 2 May 2024 12:07:20 -0400 Subject: [PATCH 3/8] linter Signed-off-by: Francisco Javier Arceo --- sdk/python/feast/feature_store.py | 4 ++-- sdk/python/feast/infra/online_stores/contrib/postgres.py | 2 -- sdk/python/feast/infra/passthrough_provider.py | 7 ++++++- sdk/python/feast/infra/provider.py | 2 +- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/sdk/python/feast/feature_store.py b/sdk/python/feast/feature_store.py index 536b0514e1..f45dbb1bc8 100644 --- a/sdk/python/feast/feature_store.py +++ b/sdk/python/feast/feature_store.py @@ -1757,7 +1757,7 @@ def retrieve_online_documents( feature=feature, query=query, top_k=top_k, - distance_metric=distinct_metric, + distance_metric=distance_metric, ) def _retrieve_online_documents( @@ -1765,7 +1765,7 @@ def _retrieve_online_documents( feature: str, query: Union[str, List[float]], top_k: int, - distance_metric: str="L2", + distance_metric: str = "L2", ): if isinstance(query, str): raise ValueError( diff --git a/sdk/python/feast/infra/online_stores/contrib/postgres.py b/sdk/python/feast/infra/online_stores/contrib/postgres.py index 37673df69d..9d148d2061 100644 --- a/sdk/python/feast/infra/online_stores/contrib/postgres.py +++ b/sdk/python/feast/infra/online_stores/contrib/postgres.py @@ -21,8 +21,6 @@ from feast.repo_config import RepoConfig from feast.usage import log_exceptions_and_usage - - SUPPORTED_DISTANCE_METRICS_DICT = { "L2": "<->", 
"inner_product": "<#>", diff --git a/sdk/python/feast/infra/passthrough_provider.py b/sdk/python/feast/infra/passthrough_provider.py index 1732c98667..2f3e30018a 100644 --- a/sdk/python/feast/infra/passthrough_provider.py +++ b/sdk/python/feast/infra/passthrough_provider.py @@ -202,7 +202,12 @@ def retrieve_online_documents( result = [] if self.online_store: result = self.online_store.retrieve_online_documents( - config, table, requested_feature, query, top_k, distance_metric, + config, + table, + requested_feature, + query, + top_k, + distance_metric, ) return result diff --git a/sdk/python/feast/infra/provider.py b/sdk/python/feast/infra/provider.py index ba25f0bb4d..50156707d0 100644 --- a/sdk/python/feast/infra/provider.py +++ b/sdk/python/feast/infra/provider.py @@ -303,7 +303,7 @@ def retrieve_online_documents( requested_feature: str, query: List[float], top_k: int, - distance_metric: str="euclidean", + distance_metric: str = "euclidean", ) -> List[ Tuple[ Optional[datetime], From d192c881c82118e632c699029f8a16737a1e8e23 Mon Sep 17 00:00:00 2001 From: Francisco Javier Arceo Date: Thu, 2 May 2024 12:38:06 -0400 Subject: [PATCH 4/8] testing other distance metric Signed-off-by: Francisco Javier Arceo --- .../online_store/test_universal_online.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/sdk/python/tests/integration/online_store/test_universal_online.py b/sdk/python/tests/integration/online_store/test_universal_online.py index 3ae7be9e1e..ea22b2ea51 100644 --- a/sdk/python/tests/integration/online_store/test_universal_online.py +++ b/sdk/python/tests/integration/online_store/test_universal_online.py @@ -798,6 +798,17 @@ def test_retrieve_online_documents(environment, fake_document_data): fs.write_to_online_store("item_embeddings", df) documents = fs.retrieve_online_documents( - feature="item_embeddings:embedding_float", query=[1.0, 2.0], top_k=2 + feature="item_embeddings:embedding_float", query=[1.0, 2.0], top_k=2, 
distance_metric="L2", ).to_dict() assert len(documents["embedding_float"]) == 2 + + documents = fs.retrieve_online_documents( + feature="item_embeddings:embedding_float", query=[1.0, 2.0], top_k=2, distance_metric="L1", + ).to_dict() + assert len(documents["embedding_float"]) == 2 + + with pytest.raises(ValueError): + fs.retrieve_online_documents( + feature="item_embeddings:embedding_float", query=[1.0, 2.0], top_k=2, distance_metric="wrong", + ).to_dict() + From 3b9b3da9d997db79e7a868cb7084898df8d4d781 Mon Sep 17 00:00:00 2001 From: Francisco Javier Arceo Date: Thu, 2 May 2024 13:52:49 -0400 Subject: [PATCH 5/8] updated default Signed-off-by: Francisco Javier Arceo --- sdk/python/feast/infra/provider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdk/python/feast/infra/provider.py b/sdk/python/feast/infra/provider.py index 50156707d0..8f3217fb5b 100644 --- a/sdk/python/feast/infra/provider.py +++ b/sdk/python/feast/infra/provider.py @@ -303,7 +303,7 @@ def retrieve_online_documents( requested_feature: str, query: List[float], top_k: int, - distance_metric: str = "euclidean", + distance_metric: str = "L2", ) -> List[ Tuple[ Optional[datetime], From cf840c31bf6a42eb0a774c5f2b7a56f547facc44 Mon Sep 17 00:00:00 2001 From: Francisco Javier Arceo Date: Thu, 2 May 2024 14:00:36 -0400 Subject: [PATCH 6/8] linter Signed-off-by: Francisco Javier Arceo --- .../online_store/test_universal_online.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/sdk/python/tests/integration/online_store/test_universal_online.py b/sdk/python/tests/integration/online_store/test_universal_online.py index ea22b2ea51..5d6462e5e3 100644 --- a/sdk/python/tests/integration/online_store/test_universal_online.py +++ b/sdk/python/tests/integration/online_store/test_universal_online.py @@ -798,17 +798,25 @@ def test_retrieve_online_documents(environment, fake_document_data): fs.write_to_online_store("item_embeddings", df) documents = 
fs.retrieve_online_documents( - feature="item_embeddings:embedding_float", query=[1.0, 2.0], top_k=2, distance_metric="L2", + feature="item_embeddings:embedding_float", + query=[1.0, 2.0], + top_k=2, + distance_metric="L2", ).to_dict() assert len(documents["embedding_float"]) == 2 documents = fs.retrieve_online_documents( - feature="item_embeddings:embedding_float", query=[1.0, 2.0], top_k=2, distance_metric="L1", + feature="item_embeddings:embedding_float", + query=[1.0, 2.0], + top_k=2, + distance_metric="L1", ).to_dict() assert len(documents["embedding_float"]) == 2 with pytest.raises(ValueError): fs.retrieve_online_documents( - feature="item_embeddings:embedding_float", query=[1.0, 2.0], top_k=2, distance_metric="wrong", + feature="item_embeddings:embedding_float", + query=[1.0, 2.0], + top_k=2, + distance_metric="wrong", ).to_dict() - From ee3448fb2f6465d5fcfb0a185ca6f352536a3bb9 Mon Sep 17 00:00:00 2001 From: Francisco Javier Arceo Date: Thu, 2 May 2024 14:15:41 -0400 Subject: [PATCH 7/8] fixed some copy Signed-off-by: Francisco Javier Arceo --- sdk/python/feast/infra/online_stores/online_store.py | 2 +- sdk/python/feast/infra/provider.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sdk/python/feast/infra/online_stores/online_store.py b/sdk/python/feast/infra/online_stores/online_store.py index 67c5a931dd..2a81e37042 100644 --- a/sdk/python/feast/infra/online_stores/online_store.py +++ b/sdk/python/feast/infra/online_stores/online_store.py @@ -158,7 +158,7 @@ def retrieve_online_documents( table: The feature view whose feature values should be read. requested_feature: The name of the feature whose embeddings should be used for retrieval. embedding: The embeddings to use for retrieval. - top_k: The number of nearest neighbors to retrieve. + top_k: The number of documents to retrieve. Returns: object: A list of top k closest documents to the specified embedding. 
Each item in the list is a tuple diff --git a/sdk/python/feast/infra/provider.py b/sdk/python/feast/infra/provider.py index 8f3217fb5b..02fba0c1f6 100644 --- a/sdk/python/feast/infra/provider.py +++ b/sdk/python/feast/infra/provider.py @@ -313,14 +313,14 @@ def retrieve_online_documents( ] ]: """ - Searches for the top-k nearest neighbors of the given document in the online document store. + Searches for the top-k most similar documents in the online document store. Args: config: The config for the current feature store. table: The feature view whose embeddings should be searched. requested_feature: the requested document feature name. query: The query embedding to search for. - top_k: The number of nearest neighbors to return. + top_k: The number of documents to return. Returns: A list of dictionaries, where each dictionary contains the document feature. From 1d88b01fe4024248817a4ec77b943f01db778a35 Mon Sep 17 00:00:00 2001 From: Francisco Javier Arceo Date: Fri, 3 May 2024 16:06:48 -0400 Subject: [PATCH 8/8] updated Signed-off-by: Francisco Javier Arceo --- sdk/python/feast/infra/online_stores/contrib/postgres.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sdk/python/feast/infra/online_stores/contrib/postgres.py b/sdk/python/feast/infra/online_stores/contrib/postgres.py index 9d148d2061..f2c32fdafd 100644 --- a/sdk/python/feast/infra/online_stores/contrib/postgres.py +++ b/sdk/python/feast/infra/online_stores/contrib/postgres.py @@ -22,10 +22,10 @@ from feast.usage import log_exceptions_and_usage SUPPORTED_DISTANCE_METRICS_DICT = { - "L2": "<->", - "inner_product": "<#>", "cosine": "<=>", "L1": "<+>", + "L2": "<->", + "inner_product": "<#>", }