feat: add Databricks ODBC engine spec (#16862)

* feat: add Databricks ODBC engine spec * Rename Databricks specs
apache · Sep 29, 2021 · 0ea83c5 · 0ea83c5
1 parent aa74721
commit 0ea83c5
Show file tree

Hide file tree

Showing 3 changed files with 96 additions and 1 deletion.
diff --git a/docs/src/pages/docs/Connecting to Databases/databricks.mdx b/docs/src/pages/docs/Connecting to Databases/databricks.mdx
@@ -0,0 +1,68 @@
+---
+name: Databricks
+menu: Connecting to Databases
+route: /docs/databases/databricks
+index: 30
+version: 1
+---
+
+## Databricks
+
+To connect to Databricks, first install [databricks-dbapi](https://pypi.org/project/databricks-dbapi/) with the optional SQLAlchemy dependencies:
+
+```bash
+pip install "databricks-dbapi[sqlalchemy]"
+```
+
+There are two ways to connect to Databricks: using a Hive connector or an ODBC connector. Both ways work similarly, but only ODBC can be used to connect to [SQL endpoints](https://docs.databricks.com/sql/admin/sql-endpoints.html).
+
+### Hive
+
+To use the Hive connector you need the following information from your cluster:
+
+- Server hostname
+- Port
+- HTTP path
+
+These can be found under "Configuration" -> "Advanced Options" -> "JDBC/ODBC".
+
+You also need an access token from "Settings" -> "User Settings" -> "Access Tokens".
+
+Once you have all this information, add a database of type "Databricks Interactive Cluster" in Superset, and use the following SQLAlchemy URI:
+
+```
+databricks+pyhive://token:{access token}@{server hostname}:{port}/{database name}
+```
+
+You also need to add the following configuration to "Other" -> "Engine Parameters", with your HTTP path:
+
+```
+{"connect_args": {"http_path": "sql/protocolv1/o/****"}}
+```
+
+### ODBC
+
+For ODBC you first need to install the [ODBC drivers for your platform](https://databricks.com/spark/odbc-drivers-download).
+
+For a regular connection use this as the SQLAlchemy URI:
+
+```
+databricks+pyodbc://token:{access token}@{server hostname}:{port}/{database name}
+```
+
+And add the following connection arguments to "Other" -> "Engine Parameters":
+
+```
+{"connect_args": {"http_path": "sql/protocolv1/o/****", "driver_path": "/path/to/odbc/driver"}}
+```
+
+The driver path should be:
+
+- `/Library/simba/spark/lib/libsparkodbc_sbu.dylib` (macOS)
+- `/opt/simba/spark/lib/64/libsparkodbc_sb64.so` (Linux)
+
+For a connection to a SQL endpoint you need to use the HTTP path from the endpoint:
+
+```
+{"connect_args": {"http_path": "/sql/1.0/endpoints/****", "driver_path": "/path/to/odbc/driver"}}
+```
diff --git a/superset/db_engine_specs/databricks.py b/superset/db_engine_specs/databricks.py
@@ -15,11 +15,34 @@
 # specific language governing permissions and limitations
 # under the License.
 
+from datetime import datetime
+from typing import Optional
+
+from superset.db_engine_specs.base import BaseEngineSpec
 from superset.db_engine_specs.hive import HiveEngineSpec
 
 
 class DatabricksHiveEngineSpec(HiveEngineSpec):
+    # Engine spec for Databricks reached through the `pyhive` driver
+    # (the `databricks+pyhive://` SQLAlchemy URI). Everything that is not
+    # overridden below is inherited from `HiveEngineSpec`.
     engine = "databricks"
-    engine_name = "Databricks"
+    engine_name = "Databricks Interactive Cluster"
     driver = "pyhive"
+    # NOTE(review): presumably the name of the result column that holds the
+    # function names when listing functions -- confirm against HiveEngineSpec.
     _show_functions_column = "function"
+
+
+class DatabricksODBCEngineSpec(BaseEngineSpec):
+    # Engine spec for Databricks reached through the `pyodbc` driver
+    # (the `databricks+pyodbc://` SQLAlchemy URI). It extends
+    # `BaseEngineSpec` rather than `HiveEngineSpec`, and borrows only the
+    # Hive pieces listed below.
+    engine = "databricks"
+    engine_name = "Databricks SQL Endpoint"
+    driver = "pyodbc"
+
+    # the syntax for the ODBC engine is identical to the Hive one, so
+    # we can reuse the expressions from `HiveEngineSpec`
+    # pylint: disable=protected-access
+    # Note: this binds the very same object as the Hive spec (no copy is
+    # made), so a mutation through either class is visible through both.
+    _time_grain_expressions = HiveEngineSpec._time_grain_expressions
+
+    @classmethod
+    def convert_dttm(cls, target_type: str, dttm: datetime) -> Optional[str]:
+        # Delegate to the Hive implementation; its result (a string or
+        # None) is returned unchanged.
+        return HiveEngineSpec.convert_dttm(target_type, dttm)
+
+    @classmethod
+    def epoch_to_dttm(cls) -> str:
+        # Delegate to the Hive implementation and return its string as-is.
+        return HiveEngineSpec.epoch_to_dttm()
diff --git a/superset/db_engine_specs/hive.py b/superset/db_engine_specs/hive.py
@@ -257,6 +257,10 @@ def convert_dttm(cls, target_type: str, dttm: datetime) -> Optional[str]:
                 .isoformat(sep=" ", timespec="microseconds")}' AS TIMESTAMP)"""
         return None
 
+    @classmethod
+    def epoch_to_dttm(cls) -> str:
+        # Returns a SQL expression template that wraps a column in
+        # `from_unixtime`. NOTE(review): `{col}` is left unformatted here;
+        # presumably the caller substitutes the column name -- confirm.
+        return "from_unixtime({col})"
+
     @classmethod
     def adjust_database_uri(
         cls, uri: URL, selected_schema: Optional[str] = None