diff --git a/docs/integrations/databricks/configuration.md b/docs/integrations/databricks/configuration.md
index d694e923c2..13c87bbc68 100644
--- a/docs/integrations/databricks/configuration.md
+++ b/docs/integrations/databricks/configuration.md
@@ -91,7 +91,7 @@ Once the cluster is running users can establish a connection to the Hopsworks Fe
     443,                                # Port to reach your Hopsworks instance, defaults to 443
     'my_project',                       # Name of your Hopsworks Feature Store project
     secrets_store='secretsmanager',     # Either parameterstore or secretsmanager
-    hostname_verification=True)         # Disable for self-signed certificates
+    hostname_verification=True          # Disable for self-signed certificates
 )
 fs = conn.get_feature_store()           # Get the project's default feature store
 ```
@@ -104,8 +104,8 @@ Once the cluster is running users can establish a connection to the Hopsworks Fe
     'my_instance',                      # DNS of your Feature Store instance
     443,                                # Port to reach your Hopsworks instance, defaults to 443
     'my_project',                       # Name of your Hopsworks Feature Store project
-    api_key_file="featurestore.key"     # For Azure, store the API key locally
-    hostname_verification=True)         # Disable for self-signed certificates
+    api_key_file="featurestore.key",    # For Azure, store the API key locally
+    hostname_verification=True          # Disable for self-signed certificates
 )
 fs = conn.get_feature_store()           # Get the project's default feature store
 ```
diff --git a/docs/integrations/python.md b/docs/integrations/python.md
index 20430e687b..7124200089 100644
--- a/docs/integrations/python.md
+++ b/docs/integrations/python.md
@@ -60,6 +60,10 @@ conn = hsfs.connection(
 fs = conn.get_feature_store()           # Get the project's default feature store
 ```
 
+!!! note "Engine"
+
+    `HSFS` uses either Apache Spark or Apache Hive as an execution engine to perform queries against the feature store. The `engine` option of the connection lets you override the default behaviour by setting it to `"hive"` or `"spark"`. By default, `HSFS` will try to use Spark as the engine if PySpark is available. So if you have PySpark installed in your local Python environment but have not configured Spark, you will have to set `engine='hive'`. Please refer to the [Spark integration guide](spark.md) to configure your local Spark cluster to be able to connect to the Hopsworks Feature Store.
+
 !!! info "Ports"
 
     If you have trouble to connect, please ensure that your Feature Store can receive incoming traffic from your Python environment on ports 443, 9083 and 9085.
diff --git a/docs/integrations/sagemaker.md b/docs/integrations/sagemaker.md
index c2294d36f3..ec2d648476 100644
--- a/docs/integrations/sagemaker.md
+++ b/docs/integrations/sagemaker.md
@@ -168,11 +168,17 @@ conn = hsfs.connection(
     443,                                # Port to reach your Hopsworks instance, defaults to 443
     'my_project',                       # Name of your Hopsworks Feature Store project
     secrets_store='secretsmanager',     # Either parameterstore or secretsmanager
-    hostname_verification=True)         # Disable for self-signed certificates
+    hostname_verification=True,         # Disable for self-signed certificates
+    engine='hive'                       # Choose Hive as the engine if you haven't set up AWS EMR
 )
 fs = conn.get_feature_store()           # Get the project's default feature store
 ```
 
+!!! note "Engine"
+
+    `HSFS` uses either Apache Spark or Apache Hive as an execution engine to perform queries against the feature store. Most AWS SageMaker kernels have PySpark installed but are not connected to AWS EMR by default, hence the `engine` option of the connection lets you override the default behaviour.
+    By default, `HSFS` will try to use Spark as the engine if PySpark is available; however, if Spark/EMR is not configured, you will have to set the engine manually to `"hive"`. Please refer to the [EMR integration guide](emr/emr_configuration.md) to set up EMR with the Hopsworks Feature Store.
+
+
 !!! info "Ports"
 
     If you have trouble connecting, please ensure that the Security Group of your Hopsworks instance on AWS is configured to allow incoming traffic from your SageMaker instance on ports 443, 9083 and 9085. See [VPC Security Groups](https://docs.aws.amazon.com/vpc/latest/userguide/VPC_SecurityGroups.html) for more information. If your SageMaker instances are not in the same VPC as your Hopsworks instance and the Hopsworks instance is not accessible from the internet then you will need to configure [VPC Peering on AWS](https://docs.aws.amazon.com/vpc/latest/peering/what-is-vpc-peering.html).
diff --git a/docs/integrations/spark.md b/docs/integrations/spark.md
index 63c17bdad3..ff9427e933 100644
--- a/docs/integrations/spark.md
+++ b/docs/integrations/spark.md
@@ -89,12 +89,15 @@ conn = hsfs.connection(
     host='my_instance',                 # DNS of your Feature Store instance
     port=443,                           # Port to reach your Hopsworks instance, defaults to 443
     project='my_project',               # Name of your Hopsworks Feature Store project
-    api_key_value='api_key',           # The API key to authenticate with the feature store
+    api_key_value='api_key',            # The API key to authenticate with the feature store
     hostname_verification=True          # Disable for self-signed certificates
 )
 fs = conn.get_feature_store()           # Get the project's default feature store
 ```
 
+!!! note "Engine"
+
+    `HSFS` uses either Apache Spark or Apache Hive as an execution engine to perform queries against the feature store. The `engine` option of the connection lets you override the default behaviour by setting it to `"hive"` or `"spark"`. By default, `HSFS` will try to use Spark as the engine if PySpark is available, so no further action should be required if you have set up Spark correctly as described above.
+
 ## Next Steps
diff --git a/python/hsfs/client/__init__.py b/python/hsfs/client/__init__.py
index 7a47446bca..87c028c8ac 100644
--- a/python/hsfs/client/__init__.py
+++ b/python/hsfs/client/__init__.py
@@ -24,6 +24,7 @@ def init(
     host=None,
     port=None,
     project=None,
+    engine=None,
     region_name=None,
     secrets_store=None,
     hostname_verification=None,
@@ -41,6 +42,7 @@ def init(
         host,
         port,
         project,
+        engine,
         region_name,
         secrets_store,
         hostname_verification,
diff --git a/python/hsfs/client/external.py b/python/hsfs/client/external.py
index 56fd7da1c7..3ffe52be40 100644
--- a/python/hsfs/client/external.py
+++ b/python/hsfs/client/external.py
@@ -34,6 +34,7 @@ def __init__(
         host,
         port,
         project,
+        engine,
         region_name,
         secrets_store,
         hostname_verification,
@@ -67,7 +68,9 @@ def __init__(
         project_info = self._get_project_info(self._project_name)
         self._project_id = str(project_info["projectId"])
 
-        if cert_folder:
+        self._cert_key = None
+
+        if engine == "hive":
             # On external Spark clients (Databricks, Spark Cluster),
             # certificates need to be provided before the Spark application starts.
             self._cert_folder_base = cert_folder
diff --git a/python/hsfs/connection.py b/python/hsfs/connection.py
index cb4b30134c..6cd422b634 100644
--- a/python/hsfs/connection.py
+++ b/python/hsfs/connection.py
@@ -82,6 +82,11 @@ class Connection:
         project: The name of the project to connect to. When running on Hopsworks, this
             defaults to the project from where the client is run from.
             Defaults to `None`.
+        engine: Which engine to use, `"spark"` or `"hive"`. Defaults to `None`, which
+            initializes the engine to Spark if the environment provides Spark, for
+            example on Hopsworks and Databricks, or falls back on Hive if Spark is not
+            available, e.g. on local Python environments or AWS SageMaker. This option
+            allows you to override this behaviour.
         region_name: The name of the AWS region in which the required secrets are
             stored, defaults to `"default"`.
         secrets_store: The secrets storage to be used, either `"secretsmanager"`,
@@ -108,6 +113,7 @@ def __init__(
         host: str = None,
         port: int = HOPSWORKS_PORT_DEFAULT,
         project: str = None,
+        engine: str = None,
         region_name: str = AWS_DEFAULT_REGION,
         secrets_store: str = SECRETS_STORE_DEFAULT,
         hostname_verification: bool = HOSTNAME_VERIFICATION_DEFAULT,
@@ -119,6 +125,7 @@ def __init__(
         self._host = host
         self._port = port
         self._project = project
+        self._engine = engine
         self._region_name = region_name
         self._secrets_store = secrets_store
         self._hostname_verification = hostname_verification
@@ -168,48 +175,49 @@ def connect(self):
         """
         self._connected = True
         try:
+            # determine engine, needed to init client
+            if (self._engine is not None and self._engine.lower() == "spark") or (
+                self._engine is None and importlib.util.find_spec("pyspark")
+            ):
+                self._engine = "spark"
+            elif (self._engine is not None and self._engine.lower() == "hive") or (
+                self._engine is None and not importlib.util.find_spec("pyspark")
+            ):
+                self._engine = "hive"
+            else:
+                raise ConnectionError(
+                    "Engine you are trying to initialize is unknown. "
+                    "Supported engines are `'spark'` and `'hive'`."
+                )
+
+            # init client
             if client.base.Client.REST_ENDPOINT not in os.environ:
-                if importlib.util.find_spec("pyspark"):
-                    # databricks, emr, external spark clusters
-                    client.init(
-                        "external",
-                        self._host,
-                        self._port,
-                        self._project,
-                        self._region_name,
-                        self._secrets_store,
-                        self._hostname_verification,
-                        self._trust_store_path,
-                        None,
-                        self._api_key_file,
-                        self._api_key_value,
-                    )
-                    engine.init("spark")
-                else:
-                    # aws
-                    client.init(
-                        "external",
-                        self._host,
-                        self._port,
-                        self._project,
-                        self._region_name,
-                        self._secrets_store,
-                        self._hostname_verification,
-                        self._trust_store_path,
-                        self._cert_folder,
-                        self._api_key_file,
-                        self._api_key_value,
-                    )
-                    engine.init(
-                        "hive",
-                        self._host,
-                        self._cert_folder,
-                        self._project,
-                        client.get_instance()._cert_key,
-                    )
+                client.init(
+                    "external",
+                    self._host,
+                    self._port,
+                    self._project,
+                    self._engine,
+                    self._region_name,
+                    self._secrets_store,
+                    self._hostname_verification,
+                    self._trust_store_path,
+                    self._cert_folder,
+                    self._api_key_file,
+                    self._api_key_value,
+                )
             else:
                 client.init("hopsworks")
-                engine.init("spark")
+
+            # init engine
+            engine.init(
+                self._engine,
+                self._host,
+                self._cert_folder,
+                self._project,
+                client.get_instance()._cert_key,
+            )
+
             self._feature_store_api = feature_store_api.FeatureStoreApi()
             self._project_api = project_api.ProjectApi()
             self._hosts_api = hosts_api.HostsApi()
@@ -239,6 +247,7 @@ def setup_databricks(
         host: str = None,
         port: int = HOPSWORKS_PORT_DEFAULT,
         project: str = None,
+        engine: str = None,
         region_name: str = AWS_DEFAULT_REGION,
         secrets_store: str = SECRETS_STORE_DEFAULT,
         hostname_verification: bool = HOSTNAME_VERIFICATION_DEFAULT,
@@ -259,6 +268,7 @@ def setup_databricks(
             host,
             port,
             project,
+            engine,
             region_name,
             secrets_store,
             hostname_verification,
@@ -286,6 +296,7 @@ def connection(
         host: str = None,
         port: int = HOPSWORKS_PORT_DEFAULT,
         project: str = None,
+        engine: str = None,
         region_name: str = AWS_DEFAULT_REGION,
         secrets_store: str = SECRETS_STORE_DEFAULT,
         hostname_verification: bool = HOSTNAME_VERIFICATION_DEFAULT,
@@ -299,6 +310,7 @@ def connection(
             host,
             port,
             project,
+            engine,
             region_name,
             secrets_store,
             hostname_verification,
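
As a usage illustration of the `engine` option introduced by this patch (not part of the diff itself), here is a minimal sketch of connecting from a local Python environment where PySpark is importable but no Spark cluster is configured, so the engine is forced to Hive. The host, project name, and API key file are placeholder values.

```python
import hsfs

# Placeholder values: substitute your own Feature Store DNS, project and API key file.
conn = hsfs.connection(
    host="my_instance",                 # DNS of your Feature Store instance
    port=443,                           # Port to reach your Hopsworks instance, defaults to 443
    project="my_project",               # Name of your Hopsworks Feature Store project
    api_key_file="featurestore.key",    # Local file holding the API key
    hostname_verification=True,         # Disable for self-signed certificates
    engine="hive",                      # Force Hive even though PySpark is importable
)
fs = conn.get_feature_store()           # Get the project's default feature store

# Leaving `engine` unset (None) keeps the default resolution added in connect():
# Spark when PySpark is importable, Hive otherwise.
```

Passing any value other than `"spark"` or `"hive"` raises a `ConnectionError`, as implemented in `Connection.connect()` above.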