From 110b613164899cae374f63e84edd9aa0182b2630 Mon Sep 17 00:00:00 2001 From: Moritz Meister <8422705+moritzmeister@users.noreply.github.com> Date: Fri, 18 Dec 2020 17:48:12 +0100 Subject: [PATCH] [HOPSWORKS-2206] HSFS profile to install with and without Hive dependencies (#200) --- docs/integrations/python.md | 6 +++++- docs/integrations/sagemaker.md | 6 +++++- python/hsfs/engine/__init__.py | 11 ++++++++++- python/setup.py | 6 ++---- 4 files changed, 22 insertions(+), 7 deletions(-) diff --git a/docs/integrations/python.md b/docs/integrations/python.md index 7124200089..a62568c143 100644 --- a/docs/integrations/python.md +++ b/docs/integrations/python.md @@ -30,9 +30,13 @@ Create a file called `featurestore.key` in your designated Python environment an To be able to access the Hopsworks Feature Store, the `HSFS` Python library needs to be installed in the environment from which you want to connect to the Feature Store. You can install the library through pip. We recommend using a Python environment manager such as *virtualenv* or *conda*. ``` -pip install hsfs~=[HOPSWORKS_VERSION] +pip install hsfs[hive]~=[HOPSWORKS_VERSION] ``` +!!! attention "Hive Dependencies" + + By default, `HSFS` assumes Spark/EMR is used as execution engine and therefore Hive dependencies are not installed. Hence, on a local Python environment, if you are planning to use a regular Python Kernel **without Spark/EMR**, make sure to install the **"hive"** extra dependencies (`hsfs[hive]`). + !!! attention "Matching Hopsworks version" The **major version of `HSFS`** needs to match the **major version of Hopsworks**. diff --git a/docs/integrations/sagemaker.md b/docs/integrations/sagemaker.md index ec2d648476..87a3fcc653 100644 --- a/docs/integrations/sagemaker.md +++ b/docs/integrations/sagemaker.md @@ -141,9 +141,13 @@ You have two options to make your API key accessible from SageMaker: To be able to access the Hopsworks Feature Store, the `HSFS` Python library needs to be installed. 
One way of achieving this is by opening a Python notebook in SageMaker and installing the `HSFS` with a magic command and pip: ``` -!pip install hsfs~=[HOPSWORKS_VERSION] +!pip install hsfs[hive]~=[HOPSWORKS_VERSION] ``` +!!! attention "Hive Dependencies" + + By default, `HSFS` assumes Spark/EMR is used as execution engine and therefore Hive dependencies are not installed. Hence, on AWS SageMaker, if you are planning to use a regular Python Kernel **without Spark/EMR**, make sure to install the **"hive"** extra dependencies (`hsfs[hive]`). + !!! attention "Matching Hopsworks version" The **major version of `HSFS`** needs to match the **major version of Hopsworks**. diff --git a/python/hsfs/engine/__init__.py b/python/hsfs/engine/__init__.py index af2a5507e0..30cb5e06f5 100644 --- a/python/hsfs/engine/__init__.py +++ b/python/hsfs/engine/__init__.py @@ -14,7 +14,8 @@ # limitations under the License. # -from hsfs.engine import spark, hive +from hsfs.engine import spark +from hsfs.client import exceptions _engine = None @@ -25,6 +26,14 @@ def init(engine_type, host=None, cert_folder=None, project=None, cert_key=None): if engine_type == "spark": _engine = spark.Engine() elif engine_type == "hive": + try: + from hsfs.engine import hive + except ImportError: + raise exceptions.FeatureStoreException( + "Trying to instantiate Hive as engine, but 'hive' extras are " + "missing in HSFS installation. Install with `pip install " + "hsfs[hive]`." 
+ ) _engine = hive.Engine(host, cert_folder, project, cert_key) diff --git a/python/setup.py b/python/setup.py index 93b7140276..a84e9e5097 100644 --- a/python/setup.py +++ b/python/setup.py @@ -22,10 +22,7 @@ def read(fname): "boto3", "pandas", "numpy", - "pyhopshive[thrift]", - "PyMySQL", "pyjks", - "sqlalchemy", "mock", ], extras_require={ @@ -37,7 +34,8 @@ def read(fname): "mkdocs", "mkdocs-material", "keras-autodoc", - "markdown-include"] + "markdown-include"], + "hive": ["pyhopshive[thrift]", "sqlalchemy", "PyMySQL"], }, author="Logical Clocks AB", author_email="moritz@logicalclocks.com",