From 5516f23aa0b8874667d7f0fb04d04ad2cccee7a7 Mon Sep 17 00:00:00 2001
From: Hassan Khan
Date: Wed, 6 Sep 2023 21:26:08 +0000
Subject: [PATCH] docs: add Getting Started with Databricks guide (#7050)

---
 .../pages/product/getting-started/_meta.js         |   3 +-
 .../product/getting-started/databricks.mdx         |  15 ++
 .../getting-started/databricks/_meta.js            |   7 +
 .../databricks/connect-to-databricks.mdx           |  81 +++++++
 .../databricks/create-data-model.mdx               | 213 ++++++++++++++++++
 .../getting-started/databricks/load-data.mdx       |  34 +++
 .../databricks/query-from-bi.mdx                   |  99 ++++++++
 .../databricks/query-from-react-app.mdx            |  86 +++++++
 8 files changed, 537 insertions(+), 1 deletion(-)
 create mode 100644 docs/docs-new/pages/product/getting-started/databricks.mdx
 create mode 100644 docs/docs-new/pages/product/getting-started/databricks/_meta.js
 create mode 100644 docs/docs-new/pages/product/getting-started/databricks/connect-to-databricks.mdx
 create mode 100644 docs/docs-new/pages/product/getting-started/databricks/create-data-model.mdx
 create mode 100644 docs/docs-new/pages/product/getting-started/databricks/load-data.mdx
 create mode 100644 docs/docs-new/pages/product/getting-started/databricks/query-from-bi.mdx
 create mode 100644 docs/docs-new/pages/product/getting-started/databricks/query-from-react-app.mdx

diff --git a/docs/docs-new/pages/product/getting-started/_meta.js b/docs/docs-new/pages/product/getting-started/_meta.js
index ddca6f99a8923..7483482782d4d 100644
--- a/docs/docs-new/pages/product/getting-started/_meta.js
+++ b/docs/docs-new/pages/product/getting-started/_meta.js
@@ -1,5 +1,6 @@
 module.exports = {
   "core": "Cube Core",
   "cloud": "Cube Cloud",
+  "databricks": "Cube Cloud and Databricks",
   "migrate-from-core": "Migrate from Cube Core"
-}
\ No newline at end of file
+}

diff --git a/docs/docs-new/pages/product/getting-started/databricks.mdx b/docs/docs-new/pages/product/getting-started/databricks.mdx
new file mode 100644
index 0000000000000..9f4923ac9f9d5
--- /dev/null
+++ b/docs/docs-new/pages/product/getting-started/databricks.mdx
@@ -0,0 +1,15 @@

# Getting started with Cube Cloud and Databricks

This getting started guide will show you how to use Cube Cloud with Databricks.
You will learn how to:

- Load sample data into your Databricks account
- Connect Cube Cloud to Databricks
- Create your first Cube data model
- Connect to a BI tool to explore this model
- Create a React application with the Cube REST API

## Prerequisites

- [Cube Cloud account](https://cubecloud.dev/auth/signup)
- [Databricks account](https://www.databricks.com/try-databricks)

diff --git a/docs/docs-new/pages/product/getting-started/databricks/_meta.js b/docs/docs-new/pages/product/getting-started/databricks/_meta.js
new file mode 100644
index 0000000000000..211598d9e2722
--- /dev/null
+++ b/docs/docs-new/pages/product/getting-started/databricks/_meta.js
@@ -0,0 +1,7 @@
module.exports = {
  "load-data": "Load data",
  "connect-to-databricks": "Connect to Databricks",
  "create-data-model": "Create data model",
  "query-from-bi": "Query from BI",
  "query-from-react-app": "Query from React"
}

diff --git a/docs/docs-new/pages/product/getting-started/databricks/connect-to-databricks.mdx b/docs/docs-new/pages/product/getting-started/databricks/connect-to-databricks.mdx
new file mode 100644
index 0000000000000..08aca3c15f519
--- /dev/null
+++ b/docs/docs-new/pages/product/getting-started/databricks/connect-to-databricks.mdx
@@ -0,0 +1,81 @@

# Connect to Databricks

In this section, we'll create a Cube Cloud deployment and connect it to
Databricks. A deployment represents a data model, configuration, and managed
infrastructure.

To continue with this guide, you'll need to have a Cube Cloud account. If you
don't have one yet, [click here to sign up][cube-cloud-signup] for free.

First, [sign in to your Cube Cloud account][cube-cloud-signin]. Then click
Create Deployment:

Give the deployment a name, select the cloud provider and region of your choice,
and click Next:

Microsoft Azure is available in Cube Cloud on the
[Premium](https://cube.dev/pricing) tier. [Contact us](https://cube.dev/contact)
for details.

## Set up a Cube project

Next, click Create to create a new project from scratch:

## Connect to your Databricks

The last step is to connect Cube Cloud to Databricks. First, select it from the
grid:

Then enter your Databricks credentials:

- **Access Token:** A personal access token for your Databricks account. [You
  can generate one][databricks-docs-pat] in your Databricks account settings.
- **Databricks JDBC URL:** The JDBC URL for your Databricks SQL warehouse. [You
  can find it][databricks-docs-jdbc-url] in the SQL warehouse settings screen.
- **Databricks Catalog:** This should match the catalog where you uploaded the
  files in the last section. If left unspecified, the `default` catalog is
  used.

[databricks-docs-pat]:
  https://docs.databricks.com/en/dev-tools/auth.html#databricks-personal-access-tokens-for-workspace-users
[databricks-docs-jdbc-url]:
  https://docs.databricks.com/en/integrations/jdbc-odbc-bi.html#get-connection-details-for-a-sql-warehouse

Click Apply, and Cube Cloud will test the connection and proceed to the
next step.

## Generate a data model from your Databricks schema

Cube can now generate a basic data model from your data warehouse, which helps
you get started with data modeling faster. Select all four tables in your
catalog and click through the data model generation wizard. We'll inspect these
generated files in the next section and start making changes to them.
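For example, the cube generated for the `users` table might look roughly like
the following. This is only a sketch: the exact columns and types depend on how
Databricks inferred the schema when you uploaded the CSV files, and the column
names and the `ECOM` schema prefix below simply mirror the generated `orders`
cube shown in the next section.

```yaml
cubes:
  - name: users
    sql_table: ECOM.USERS

    dimensions:
      - name: user_id
        sql: USER_ID
        type: number
        primary_key: true

      - name: city
        sql: CITY
        type: string

      - name: state
        sql: STATE
        type: string

      - name: age
        sql: AGE
        type: number

      - name: created_at
        sql: CREATED_AT
        type: time

    measures:
      - name: count
        type: count
```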
+ +[aws-docs-sec-group]: + https://docs.aws.amazon.com/vpc/latest/userguide/security-groups.html +[aws-docs-sec-group-rule]: + https://docs.aws.amazon.com/vpc/latest/userguide/security-group-rules.html +[cube-cloud-signin]: https://cubecloud.dev/auth +[cube-cloud-signup]: https://cubecloud.dev/auth/signup +[ref-conf-db]: /product/configuration/data-sources +[ref-getting-started-cloud-generate-models]: + /getting-started/cloud/generate-models diff --git a/docs/docs-new/pages/product/getting-started/databricks/create-data-model.mdx b/docs/docs-new/pages/product/getting-started/databricks/create-data-model.mdx new file mode 100644 index 0000000000000..7b69a5f088758 --- /dev/null +++ b/docs/docs-new/pages/product/getting-started/databricks/create-data-model.mdx @@ -0,0 +1,213 @@ +# Create your first data model + +Cube follows a dataset-oriented data modeling approach, which is inspired by and +expands upon dimensional modeling. Cube incorporates this approach and provides +a practical framework for implementing dataset-oriented data modeling. + +When building a data model in Cube, you work with two dataset-centric objects: +**cubes** and **views**. **Cubes** usually represent business entities such as +customers, line items, and orders. In cubes, you define all the calculations +within the measures and dimensions of these entities. Additionally, you define +relationships between cubes, such as "an order has many line items" or "a user +may place multiple orders." + +**Views** sit on top of a data graph of cubes and create a facade of your entire +data model, with which data consumers can interact. You can think of views as +the final data products for your data consumers - BI users, data apps, AI +agents, etc. When building views, you select measures and dimensions from +different connected cubes and present them as a single dataset to BI or data +apps. + + + +## Working with cubes + +To begin building your data model, click on Enter Development Mode in +Cube Cloud. This will take you to your personal developer space, where you can +safely make changes to your data model without affecting the production +environment. + +In the previous section, we generated four cubes. To see the data graph of these +four cubes and how they are connected to each other, click the Show +Graph button on the Data Model page. + +Let's review the `orders` cube first and update it with additional dimensions +and measures. + +Once you are in developer mode, navigate to the Data Model and click +on the `orders.yml` file in the left sidebar inside the `model/cubes` directory +to open it. + +You should see the following content of `model/cubes/orders.yml` file. + +```yaml +cubes: + - name: orders + sql_table: ECOM.ORDERS + + joins: + - name: users + sql: "{CUBE}.USER_ID = {users}.USER_ID" + relationship: many_to_one + + dimensions: + - name: status + sql: STATUS + type: string + + - name: id + sql: ID + type: number + primary_key: true + + - name: created_at + sql: CREATED_AT + type: time + + - name: completed_at + sql: COMPLETED_AT + type: time + + measures: + - name: count + type: count +``` + +As you can see, we already have a `count` measure that we can use to calculate +the total count of our orders. + +Let's add an additional measure to the `orders` cube to calculate only +**completed orders**. The `status` dimension in the `orders` cube reflects the +three possible statuses: **processing**, **shipped**, or **completed**. We will +create a new measure `completed_count` by using a filter on that dimension. 
To do this, we will use a
[filter parameter](/product/data-modeling/reference/measures#filters) of the
measure and
[refer](/product/data-modeling/fundamentals/syntax#referring-to-objects) to the
existing dimension.

Add the following measure definition to your `model/cubes/orders.yml` file. It
should be included within the `measures` block.

```yaml
- name: completed_count
  type: count
  filters:
    - sql: "{CUBE}.status = 'completed'"
```

With these two measures in place, `count` and `completed_count`, we can create a
**derived measure**. Derived measures are measures that you can create based on
existing measures. Let's create the `completed_percentage` derived measure.

Add the following measure definition to your `model/cubes/orders.yml` file
within the `measures` block.

```yaml
- name: completed_percentage
  type: number
  sql: "({completed_count} / NULLIF({count}, 0)) * 100.0"
  format: percent
```

Below you can see what your updated `orders` cube should look like with the two
new measures. Feel free to copy this code and paste it into your
`model/cubes/orders.yml` file.

```yaml
cubes:
  - name: orders
    sql_table: ECOM.ORDERS

    joins:
      - name: users
        sql: "{CUBE}.USER_ID = {users}.USER_ID"
        relationship: many_to_one

    dimensions:
      - name: status
        sql: STATUS
        type: string

      - name: id
        sql: ID
        type: number
        primary_key: true

      - name: created_at
        sql: CREATED_AT
        type: time

      - name: completed_at
        sql: COMPLETED_AT
        type: time

    measures:
      - name: count
        type: count

      - name: completed_count
        type: count
        filters:
          - sql: "{CUBE}.status = 'completed'"

      - name: completed_percentage
        type: number
        sql: "({completed_count} / NULLIF({count}, 0)) * 100.0"
        format: percent
```

Click Save All in the upper corner to save changes to the data model.
Now, you can navigate to Cube's Playground. The Playground is a web-based tool
that allows you to query your data without connecting any tools or writing any
code. It's the fastest way to explore and test your data model.

You can select measures and dimensions from different cubes in the Playground,
including your newly created `completed_percentage` measure.

## Working with views

When building views, we recommend following entity-oriented design and
structuring your views around your business entities. Usually, cubes tend to be
normalized entities without duplicated or redundant members, while views are
denormalized entities where you pick as many measures and dimensions from
multiple cubes as needed to describe a business entity.

Let's create our first view, which will provide all necessary measures and
dimensions to explore orders. Views are usually located in the `views` folder
and have a `_view` postfix.

Create `model/views/orders_view.yml` with the following content:

```yaml
views:
  - name: orders_view

    cubes:
      - join_path: orders
        includes:
          - status
          - created_at
          - count
          - completed_count
          - completed_percentage

      - join_path: orders.users
        prefix: true
        includes:
          - city
          - age
          - state
```

When building views, you can leverage the `cubes` parameter, which enables you
to include measures and dimensions from other cubes in the view. You can build
your view by combining multiple joined cubes and specifying the path by which
they should be joined for that particular view.

After saving, you can experiment with your newly created view in the Playground.
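For reference, a query that the Playground builds against this view is just a
JSON object listing the view's members. Note that the members included from the
`users` cube are prefixed with the cube name because of `prefix: true`. Here is
a sketch of such a query, using only members defined in the view above:

```json
{
  "measures": ["orders_view.completed_percentage"],
  "dimensions": ["orders_view.users_state"],
  "timeDimensions": [
    {
      "dimension": "orders_view.created_at",
      "granularity": "month"
    }
  ]
}
```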
+In the next section, we will learn how to query our `orders_view` using a BI +tool. diff --git a/docs/docs-new/pages/product/getting-started/databricks/load-data.mdx b/docs/docs-new/pages/product/getting-started/databricks/load-data.mdx new file mode 100644 index 0000000000000..1f5b158e62155 --- /dev/null +++ b/docs/docs-new/pages/product/getting-started/databricks/load-data.mdx @@ -0,0 +1,34 @@ +# Load data + +The following steps will guide you through setting up a Databricks account and +uploading the demo dataset, which is stored as CSV files in a public S3 bucket. + +First, download the following files to your local machine: + +- [`line_items.csv`](https://cube-tutorial.s3.us-east-2.amazonaws.com/line_items.csv) +- [`orders.csv`](https://cube-tutorial.s3.us-east-2.amazonaws.com/orders.csv) +- [`users.csv`](https://cube-tutorial.s3.us-east-2.amazonaws.com/users.csv) +- [`products.csv`](https://cube-tutorial.s3.us-east-2.amazonaws.com/products.csv) + +Next, let's ensure we have a SQL warehouse that is active. Log in to your +Databricks account, then from the sidebar, click on SQL → SQL +Warehouses: + + + + + +Ensure the warehouse is active by checking its status; if it is inactive, click + +▶️ to start it. + + + +Next, click New → File upload from the sidebar, and upload +`line_items.csv`. The UI will show a preview of the data within the file; when +ready, click Create table. + +Repeat the above steps for the three other files. diff --git a/docs/docs-new/pages/product/getting-started/databricks/query-from-bi.mdx b/docs/docs-new/pages/product/getting-started/databricks/query-from-bi.mdx new file mode 100644 index 0000000000000..3b5272a271015 --- /dev/null +++ b/docs/docs-new/pages/product/getting-started/databricks/query-from-bi.mdx @@ -0,0 +1,99 @@ +# Query from a BI tool + +You can query Cube using a BI or visualization tool through the Cube SQL API. To +provide a good end-user experience in your BI tool, we recommend mapping the +BI's data model to Cube's semantic layer. This can be done automatically with +Semantic Layer Sync or manually. + +## Semantic Layer Sync + +Semantic Layer Sync programmatically connects a BI tool to Cube and creates or +updates BI-specific entities that correspond to entities within the data model +in Cube, such as cubes, views, measures, and dimensions. + + + +Semantic Layer Sync will synchronize all public cubes and views with connected +BI tools. We recommend making your cubes private and only exposing views. Both +cubes and views are public by default. To make cubes private, set the +[public](/product/data-modeling/reference/cube#public) parameter to `false`. + +```yaml +cubes: + - name: orders + sql_table: ECOM.ORDERS + public: false +``` + +Let’s create our first Semantic Layer Sync with +[Apache Superset](https://superset.apache.org/)! + +You can create a new sync by navigating to the Semantic Layer Sync +tab on the BI Integrations page and clicking + Create +Sync. Follow the steps in the wizard to create a sync. + +Under the hood, Semantic Layer Sync is configured using the `semanticLayerSync` +option in the `cube.js` configuration file. + +Cube uses the Superset API, which requires a `user` and `password` for +authentication. You can use your own username and password or create a new +service account. You can copy a `URL` from any page of your Superset workspace. 
Example `cube.js` configuration file for Superset:

```javascript
module.exports = {
  semanticLayerSync: () => {
    return [
      {
        type: "superset",
        name: "Superset Sync",
        config: {
          user: "mail@example.com",
          password: "4dceae-606a03-93ae6dc7",
          url: "superset.example.com",
        },
      },
    ];
  },
};
```

Replace the fields for user, password, and URL with your Superset credentials,
then click Save All. You can now go to the BI Integrations page and
trigger the synchronization of your newly created semantic layer.

After running the sync, navigate to your Superset instance. You should see the
`orders_view` dataset that was created in Superset. Cube automatically maps all
metrics and dimensions in Superset to measures and dimensions in the Cube data
model.

## Manual Setup

Alternatively, you can connect to Cube and create all the mappings manually. To
do this, navigate to your Apache Superset instance and connect to Cube Cloud as
if it were a Postgres database.

You can find the credentials to connect to Cube on the BI
Integrations page under the SQL API Connection tab.

After connecting, create a new dataset in Superset and select `orders_view` as a
table. Now you can map Superset metrics and columns to Cube's measures and
dimensions.

As you can see, we use the `MEASURE` function in the "SQL expression" field.
This function informs Cube that we are querying the measure and that it should
be evaluated based on Cube's data model. You can now query Cube from Superset,
as shown in the image below.

In the next section, we will learn how to use Cube's REST API to query our view
from a React app.

diff --git a/docs/docs-new/pages/product/getting-started/databricks/query-from-react-app.mdx b/docs/docs-new/pages/product/getting-started/databricks/query-from-react-app.mdx
new file mode 100644
index 0000000000000..e12b2df6c0de8
--- /dev/null
+++ b/docs/docs-new/pages/product/getting-started/databricks/query-from-react-app.mdx
@@ -0,0 +1,86 @@

# Query from a React app

Cube offers both [REST](/product/apis-integrations/rest-api) and
[GraphQL](/product/apis-integrations/graphql-api) APIs, which can be used to
query data from applications built in React or other frontend frameworks.

You can find your REST API endpoint on the Overview page. In
development mode, Cube creates an isolated endpoint for testing data model
changes without affecting production. The structure of your REST API endpoint in
development mode should follow the format below.

```
https://..cubecloudapp.dev/dev-mode//cubejs-api/v1
```

To test your REST API from your terminal, you can use [curl](https://curl.se/).
Click on “How to connect your application” next to the REST API, and it will
display a code snippet that you can run in your terminal to test the endpoint
with curl.

Cube offers a frontend JavaScript SDK, as well as a React integration that you
can use in your application.

First, you'll need to install two packages from `npm`:

- [@cubejs-client/core](https://www.npmjs.com/package/@cubejs-client/core)
- [@cubejs-client/react](https://www.npmjs.com/package/@cubejs-client/react)

Next, initialize `cubejsApi` within your application.

Please note that you must sign your request with the correct authentication
token. Cube uses the [JSON Web Token (JWT)](https://jwt.io/) standard by default
to authenticate requests. You can copy a temporary token from the "How to
connect to your application" modal window.
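That temporary token is fine for quick local testing. For production use, you
should generate tokens from your deployment's API secret on your own backend
rather than copying them from the UI. Below is a minimal Node.js sketch of that
step; it assumes the widely used `jsonwebtoken` package and an API secret
provided to the process as an environment variable, and both the variable name
and the empty claims object are illustrative only:

```javascript
const jwt = require("jsonwebtoken");

// The API secret is shown in your Cube Cloud deployment settings.
// Keep it on the server side only; never ship it to the browser.
const apiSecret = process.env.CUBEJS_API_SECRET;

// Sign a token that expires in 30 days; the payload can carry a security
// context (for example, a user or tenant id) if your data model needs one.
const token = jwt.sign({}, apiSecret, { expiresIn: "30d" });

console.log(token);
```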
You can learn more about signing and validating tokens in the
[Authentication & Authorization](/product/auth) section of the documentation.

```jsx
import cubejs from "@cubejs-client/core";

const cubejsApi = cubejs("your-token", {
  apiUrl:
    "https://..cubecloudapp.dev/dev-mode//cubejs-api/v1",
});
```

The Cube React package includes a `CubeProvider` that can be used in your React
application.

```jsx
import { CubeProvider } from "@cubejs-client/react";

<CubeProvider cubejsApi={cubejsApi}>{/* your application */}</CubeProvider>;
```

Finally, you can use the `useCubeQuery` hook to load data from Cube into your
React application.

```jsx
import { useCubeQuery } from "@cubejs-client/react";

// ...

const { resultSet, isLoading, error, progress } = useCubeQuery({
  measures: ["orders_view.completed_count"],
  timeDimensions: [
    {
      dimension: "orders_view.created_at",
      granularity: "month",
    },
  ],
});
```

For more information on the Cube JavaScript frontend package and integration
with React, please refer to the documentation.

You can also explore example applications built with React on top of the Cube
REST API, along with their source code:

- [React with Highcharts](https://highcharts-demo.cube.dev/#/)
- [React with AG Grid](https://react-pivot-table-demo.cube.dev/#/)
- [React query builder](https://react-dashboard-demo.cube.dev/#/)
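Finally, as a reference for how the snippets above fit together, here is a
minimal, self-contained sketch of a complete component. It is illustrative
rather than canonical: it reuses the placeholder token and API URL from above
and renders the monthly counts as a plain HTML table with
`resultSet.tablePivot()` from `@cubejs-client/core`:

```jsx
import React from "react";
import cubejs from "@cubejs-client/core";
import { CubeProvider, useCubeQuery } from "@cubejs-client/react";

// Placeholders: use the token and REST API endpoint from your deployment.
const cubejsApi = cubejs("your-token", { apiUrl: "your-api-url" });

function CompletedOrdersByMonth() {
  const { resultSet, isLoading, error } = useCubeQuery({
    measures: ["orders_view.completed_count"],
    timeDimensions: [
      { dimension: "orders_view.created_at", granularity: "month" },
    ],
  });

  if (isLoading) return <div>Loading…</div>;
  if (error) return <div>{error.toString()}</div>;
  if (!resultSet) return null;

  // tablePivot() returns one row per month, keyed by member names
  return (
    <table>
      <tbody>
        {resultSet.tablePivot().map((row) => (
          <tr key={row["orders_view.created_at.month"]}>
            <td>{row["orders_view.created_at.month"]}</td>
            <td>{row["orders_view.completed_count"]}</td>
          </tr>
        ))}
      </tbody>
    </table>
  );
}

export default function App() {
  return (
    <CubeProvider cubejsApi={cubejsApi}>
      <CompletedOrdersByMonth />
    </CubeProvider>
  );
}
```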