From 0ea83c5a003ee8bbf1eea6265045b00659f4507a Mon Sep 17 00:00:00 2001
From: Beto Dealmeida
Date: Tue, 28 Sep 2021 19:36:28 -0700
Subject: [PATCH] feat: add Databricks ODBC engine spec (#16862)

* feat: add Databricks ODBC engine spec

* Rename Databricks specs
---
 .../Connecting to Databases/databricks.mdx | 68 +++++++++++++++++++
 superset/db_engine_specs/databricks.py     | 25 ++++++-
 superset/db_engine_specs/hive.py           |  4 ++
 3 files changed, 96 insertions(+), 1 deletion(-)
 create mode 100644 docs/src/pages/docs/Connecting to Databases/databricks.mdx

diff --git a/docs/src/pages/docs/Connecting to Databases/databricks.mdx b/docs/src/pages/docs/Connecting to Databases/databricks.mdx
new file mode 100644
index 0000000000000..4e3d72c156566
--- /dev/null
+++ b/docs/src/pages/docs/Connecting to Databases/databricks.mdx
@@ -0,0 +1,68 @@
+---
+name: Databricks
+menu: Connecting to Databases
+route: /docs/databases/databricks
+index: 30
+version: 1
+---
+
+## Databricks
+
+To connect to Databricks, first install [databricks-dbapi](https://pypi.org/project/databricks-dbapi/) with the optional SQLAlchemy dependencies:
+
+```bash
+pip install databricks-dbapi[sqlalchemy]
+```
+
+There are two ways to connect to Databricks: with a Hive connector or with an ODBC connector. Both work similarly, but only the ODBC connector can be used to connect to [SQL endpoints](https://docs.databricks.com/sql/admin/sql-endpoints.html).
+
+### Hive
+
+To use the Hive connector you need the following information from your cluster:
+
+- Server hostname
+- Port
+- HTTP path
+
+These can be found under "Configuration" -> "Advanced Options" -> "JDBC/ODBC".
+
+You also need an access token, which you can create under "Settings" -> "User Settings" -> "Access Tokens".
+
+Once you have all this information, add a database of type "Databricks (Hive)" in Superset, and use the following SQLAlchemy URI:
+
+```
+databricks+pyhive://token:{access token}@{server hostname}:{port}/{database name}
+```
+
+You also need to add the following configuration to "Other" -> "Engine Parameters", with your HTTP path:
+
+```
+{"connect_args": {"http_path": "sql/protocolv1/o/****"}}
+```
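+
+To verify the connection outside Superset, you can build the same engine directly with SQLAlchemy. This is a minimal sketch, not part of the Databricks setup itself; the URI placeholders and the HTTP path are the same ones described above and must be replaced with your own values:
+
+```python
+from sqlalchemy import create_engine
+
+# The SQLAlchemy URI from above; the "Engine Parameters" JSON becomes
+# the connect_args keyword argument.
+engine = create_engine(
+    "databricks+pyhive://token:{access token}@{server hostname}:{port}/{database name}",
+    connect_args={"http_path": "sql/protocolv1/o/****"},
+)
+
+# If everything is configured correctly, this prints 1.
+with engine.connect() as connection:
+    print(connection.execute("SELECT 1").scalar())
+```
+
+The same pattern works for the ODBC connector described below; only the URI and the connection arguments change.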
+
+### ODBC
+
+For ODBC, first install the [ODBC drivers for your platform](https://databricks.com/spark/odbc-drivers-download).
+
+For a regular connection, use this as the SQLAlchemy URI:
+
+```
+databricks+pyodbc://token:{access token}@{server hostname}:{port}/{database name}
+```
+
+And for the connection arguments:
+
+```
+{"connect_args": {"http_path": "sql/protocolv1/o/****", "driver_path": "/path/to/odbc/driver"}}
+```
+
+The driver path should be:
+
+- `/Library/simba/spark/lib/libsparkodbc_sbu.dylib` (macOS)
+- `/opt/simba/spark/lib/64/libsparkodbc_sb64.so` (Linux)
+
+For a connection to a SQL endpoint, use the HTTP path from the endpoint:
+
+```
+{"connect_args": {"http_path": "/sql/1.0/endpoints/****", "driver_path": "/path/to/odbc/driver"}}
+```
diff --git a/superset/db_engine_specs/databricks.py b/superset/db_engine_specs/databricks.py
index 1c11c4017b736..7f0e44f785564 100644
--- a/superset/db_engine_specs/databricks.py
+++ b/superset/db_engine_specs/databricks.py
@@ -15,11 +15,34 @@
 # specific language governing permissions and limitations
 # under the License.
+from datetime import datetime
+from typing import Optional
+
+from superset.db_engine_specs.base import BaseEngineSpec
 from superset.db_engine_specs.hive import HiveEngineSpec
 
 
 class DatabricksHiveEngineSpec(HiveEngineSpec):
     engine = "databricks"
-    engine_name = "Databricks"
+    engine_name = "Databricks Interactive Cluster"
     driver = "pyhive"
 
     _show_functions_column = "function"
+
+
+class DatabricksODBCEngineSpec(BaseEngineSpec):
+    engine = "databricks"
+    engine_name = "Databricks SQL Endpoint"
+    driver = "pyodbc"
+
+    # the syntax for the ODBC engine is identical to the Hive one, so
+    # we can reuse the expressions from `HiveEngineSpec`
+    # pylint: disable=protected-access
+    _time_grain_expressions = HiveEngineSpec._time_grain_expressions
+
+    @classmethod
+    def convert_dttm(cls, target_type: str, dttm: datetime) -> Optional[str]:
+        return HiveEngineSpec.convert_dttm(target_type, dttm)
+
+    @classmethod
+    def epoch_to_dttm(cls) -> str:
+        return HiveEngineSpec.epoch_to_dttm()
diff --git a/superset/db_engine_specs/hive.py b/superset/db_engine_specs/hive.py
index b6c3bffdc8084..061ab7f52373a 100644
--- a/superset/db_engine_specs/hive.py
+++ b/superset/db_engine_specs/hive.py
@@ -257,6 +257,10 @@ def convert_dttm(cls, target_type: str, dttm: datetime) -> Optional[str]:
             .isoformat(sep=" ", timespec="microseconds")}' AS TIMESTAMP)"""
         return None
 
+    @classmethod
+    def epoch_to_dttm(cls) -> str:
+        return "from_unixtime({col})"
+
     @classmethod
     def adjust_database_uri(
         cls, uri: URL, selected_schema: Optional[str] = None
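
A quick way to see what the new spec does: `DatabricksODBCEngineSpec` delegates its datetime handling to `HiveEngineSpec`, so both Databricks specs generate identical SQL. A minimal sketch, assuming a Superset checkout with this patch applied (the column name `ts` is just an example):

```python
from datetime import datetime

from superset.db_engine_specs.databricks import DatabricksODBCEngineSpec

dttm = datetime(2021, 9, 28, 19, 36, 28)

# Timestamp literals are rendered by the reused HiveEngineSpec.convert_dttm.
print(DatabricksODBCEngineSpec.convert_dttm("TIMESTAMP", dttm))
# CAST('2021-09-28 19:36:28.000000' AS TIMESTAMP)

# epoch_to_dttm() returns a template that Superset fills in with the
# column name when converting epoch columns to datetimes.
print(DatabricksODBCEngineSpec.epoch_to_dttm().format(col="ts"))
# from_unixtime(ts)
```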