♻️ Refactor current code to use Python SDK #27

Merged
merged 86 commits into from
Jul 25, 2023
Commits
7405aa8
check for valid pat
nsenno-dbr Jun 8, 2023
fb31a1b
Merge branch 'main' into python-sdk
renardeinside Jun 8, 2023
8444b26
Merge branch 'main' into python-sdk
renardeinside Jun 9, 2023
65c8ef6
implement authconfig
renardeinside Jun 9, 2023
f2cedbe
get local workspace groups
nsenno-dbr Jun 14, 2023
6802bd0
save workbook
nsenno-dbr Jun 14, 2023
b4a3ad8
Merge branch 'python-sdk' of github.com:databricks/UC-Upgrade into py…
nsenno-dbr Jun 14, 2023
5532df3
remove scratches from repo
renardeinside Jul 17, 2023
291527c
fix linters
renardeinside Jul 17, 2023
5d60560
introduce structure
renardeinside Jul 17, 2023
8bd905d
add clear toolkit instructions
renardeinside Jul 17, 2023
5554f65
fix linter errors
renardeinside Jul 17, 2023
60a08c5
upgrade versions
renardeinside Jul 17, 2023
b4a70b4
Merge remote-tracking branch 'origin/main' into python-sdk
renardeinside Jul 17, 2023
8e6a1b6
fix exclusions
renardeinside Jul 17, 2023
0e38e8b
add runtime imports
renardeinside Jul 17, 2023
1937b30
add nice formatting and docs to the migration notebook
Jul 17, 2023
4240408
add verifications
renardeinside Jul 17, 2023
d893fc8
fix
renardeinside Jul 17, 2023
4c09c11
add creds
renardeinside Jul 17, 2023
0feb6d8
minor fixes
renardeinside Jul 19, 2023
79007c3
finish migration to hatch
renardeinside Jul 19, 2023
5471c0a
Merge remote-tracking branch 'origin/main' into python-sdk
renardeinside Jul 19, 2023
2b3560a
add hatch caching
renardeinside Jul 19, 2023
c610a2d
move logger to mixins
renardeinside Jul 19, 2023
68e5d5f
move logger to mixins
renardeinside Jul 19, 2023
1314a72
move ruff comment to the top
renardeinside Jul 19, 2023
b960141
exclude notebooks from ruff checks
renardeinside Jul 19, 2023
9750b02
fix readme
renardeinside Jul 19, 2023
02963c8
fix readme
renardeinside Jul 19, 2023
bbac931
fix readme
renardeinside Jul 19, 2023
83cd99c
improve dev docs
renardeinside Jul 19, 2023
304502f
remove context-related methods
renardeinside Jul 19, 2023
8b942de
provide installation mechanism for package in the notebook
renardeinside Jul 19, 2023
2986db2
fix imports
renardeinside Jul 19, 2023
aac5b0f
rename function
renardeinside Jul 19, 2023
06d99af
minor fixes
renardeinside Jul 19, 2023
744e31d
minor fixes
renardeinside Jul 19, 2023
0eb8ddd
fix imports
renardeinside Jul 19, 2023
52303d2
fix order
renardeinside Jul 19, 2023
7335fae
add path
renardeinside Jul 19, 2023
f97bf64
add path
renardeinside Jul 19, 2023
9458fb9
provide a new way to introduce dependencies
renardeinside Jul 19, 2023
81671dc
fix imports
renardeinside Jul 19, 2023
085a748
disable reloads
renardeinside Jul 19, 2023
4f5c382
add paths
renardeinside Jul 19, 2023
2794849
add builder and installator
renardeinside Jul 19, 2023
3073a94
add hatch installation
renardeinside Jul 19, 2023
74f1f07
add hatch installation
renardeinside Jul 19, 2023
f7aa254
fix usages
renardeinside Jul 19, 2023
f1d07c5
remove notebooks
renardeinside Jul 19, 2023
7259bd0
introduce tests
renardeinside Jul 20, 2023
e18ef85
fix readme
renardeinside Jul 20, 2023
7a48061
fix readme
renardeinside Jul 20, 2023
d48509c
save first e2e
renardeinside Jul 20, 2023
b1a2a72
add mc tests
renardeinside Jul 20, 2023
c6be9d4
add unit testing
renardeinside Jul 20, 2023
80235f0
add new mocking logic
renardeinside Jul 20, 2023
feb6d65
add config loader
renardeinside Jul 20, 2023
c681978
lint
renardeinside Jul 20, 2023
81cd637
silence pyspark broadcast warnings
renardeinside Jul 20, 2023
cec0213
fix str enum issues
renardeinside Jul 20, 2023
3cd8579
fix text messages
renardeinside Jul 20, 2023
2c613e0
add spark mock
renardeinside Jul 21, 2023
c9ea222
fix lint
renardeinside Jul 21, 2023
cb8bb50
fix strenum
renardeinside Jul 21, 2023
3d08568
add session adapter
renardeinside Jul 21, 2023
f45bf57
align methods to managers
renardeinside Jul 21, 2023
676fc6d
add crud ops for temp groups
renardeinside Jul 21, 2023
1f6cedd
remove temp dirs
renardeinside Jul 21, 2023
bf7bb00
add functionality to apply new permissions to temp groups
renardeinside Jul 24, 2023
2eaa103
apply linter
renardeinside Jul 24, 2023
96c1d34
reformat readme
renardeinside Jul 24, 2023
1b6e7ba
lint
renardeinside Jul 24, 2023
c2e2794
remove args
renardeinside Jul 24, 2023
93e1f03
add group migration logic
renardeinside Jul 25, 2023
7c920c1
add e2e tests and bugfixes
renardeinside Jul 25, 2023
33a35cf
update readme
renardeinside Jul 25, 2023
232873e
improve readme
renardeinside Jul 25, 2023
48f8992
fix readme and e2e tests
renardeinside Jul 25, 2023
3c4668e
include default actions into the integration tests
renardeinside Jul 25, 2023
1230b3c
add verifications to e2e tests
renardeinside Jul 25, 2023
54e2d2f
add e2e tests for permissions
renardeinside Jul 25, 2023
47798cb
lint
renardeinside Jul 25, 2023
d20462e
remove outdated comments
renardeinside Jul 25, 2023
cb1fbdd
finalize entitlements and roles
renardeinside Jul 25, 2023
39 changes: 15 additions & 24 deletions .github/workflows/push.yml
@@ -6,45 +6,36 @@ on:
push:
branches: [main]

env:
HATCH_VERSION: 1.7.0

jobs:
ci:
strategy:
matrix:
pyVersion: [ '3.9' ]
pyVersion: [ '3.10' ]
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v2
uses: actions/checkout@v3

- name: Unshallow
run: git fetch --prune --unshallow

- name: Install Python
uses: actions/setup-python@v4
with:
cache: 'pip'
cache-dependency-path: '**/pyproject.toml'
python-version: ${{ matrix.pyVersion }}


- name: Install Poetry
uses: snok/install-poetry@v1
with:
virtualenvs-create: true
virtualenvs-in-project: true
installer-parallel: true


- name: Load cache
id: cached-poetry-dependencies
uses: actions/cache@v3
with:
path: .venv
key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}


- name: Install project dependencies
if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
run: poetry install --no-interaction --no-root --with=dev

- name: Install hatch
run: pip install hatch==$HATCH_VERSION

- name: Verify linting
run: make verify
run: |
hatch run lint:verify

- name: Run unit tests
run: |
hatch run unit:test
7 changes: 6 additions & 1 deletion .gitignore
@@ -93,8 +93,9 @@ celerybeat.pid
*.sage.py

# Environments
.env
.env.admin
.venv
.env.*
env/
venv/
ENV/
@@ -134,3 +135,7 @@ cython_debug/

# ruff
.ruff_cache
/scratch

# dev files and scratches
dev/cleanup.py
11 changes: 0 additions & 11 deletions Makefile
@@ -1,11 +0,0 @@
lint:
	@echo "Linting the project code"
	poetry run black .
	poetry run isort .
	poetry run ruff . --fix

verify:
	@echo "Verifying the project code"
	poetry run black . --check
	poetry run isort . --check
	poetry run ruff .
137 changes: 122 additions & 15 deletions README.md
@@ -2,14 +2,17 @@

This repo contains various functions and utilities for UC Upgrade.


## Latest working version and how-to

Please note that the current project status is 🏗️ **WIP**, but we have a minimal set of already working utilities.

To run the notebooks, please use the latest LTS Databricks Runtime (non-ML), without Photon, in single-user cluster mode.
If you have Table ACL Clusters or SQL Warehouse where ACL have been defined, you should create a TableACL cluster to run this notebook

Please note that script is executed only on the driver node, therefore you'll need to use a Single Node Cluster with sufficient amount of cores (e.g. 16 cores).
> If you have Table ACL Clusters or SQL Warehouse where ACL have been defined, you should create a TableACL cluster to
> run this notebook.

Please note that the script is executed **only** on the driver node, therefore you'll need to use a Single Node Cluster
with a sufficient number of cores (e.g. 16 cores).

Recommended VM types are:

@@ -18,27 +21,131 @@ Recommended VM types are:
- GCP: `c2-standard-16`

**For now please switch to the `v0.0.1` tag in the GitHub to get the latest working version.**
**All instructions below are currently in WIP mode.**

## Group migration

During the UC adoption, it's critical to move the groups from the workspace level to the account level.

To deliver this migration, the following steps are performed:


| Step description | Relevant API method |
|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------|
| A set of groups to be migrated is identified (either via `groups.selected` config property, or automatically).<br/>Group existence is verified against the account level.<br/>**If there is no group on the account level, an error is thrown.**<br/>Backup groups are created on the workspace level. | `toolkit.prepare_groups_in_environment()` |
| Inventory table is cleaned up. | `toolkit.cleanup_inventory_table()` |
| Workspace local group permissions are inventorized and saved into a Delta Table. | `toolkit.inventorize_permissions()` |
| Backup groups are entitled with permissions from the inventory table. | `toolkit.apply_permissions_to_backup_groups()` |
| Workspace-level groups are deleted. Account-level groups are granted with access to the workspace.<br/>Workspace-level entitlements are synced from backup groups to newly added account-level groups. | `toolkit.replace_workspace_groups_with_account_groups()` |
| Account-level groups are entitled with workspace-level permissions from the inventory table. | `toolkit.apply_permissions_to_account_groups()` |
| Backup groups are deleted | `toolkit.delete_backup_groups()` |
| Inventory table is cleaned up. This step is optional. | `toolkit.cleanup_inventory_table()` |
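
The step sequence in the table above can be sketched end-to-end. This is only an illustration: the method names and their order come from the table, but the `GroupMigrationToolkit` stub below is a hypothetical stand-in, not the repo's confirmed entry point (the real toolkit talks to a workspace).

```python
# Sketch of the group-migration workflow; the stub only records call order.
class GroupMigrationToolkit:
    """Hypothetical stand-in that records which step ran when."""

    def __init__(self):
        self.calls = []

    def __getattr__(self, name):
        # Any toolkit.<step>() invocation is appended to self.calls.
        def _step():
            self.calls.append(name)
        return _step


def run_group_migration(toolkit):
    # Order matches the table above.
    toolkit.prepare_groups_in_environment()
    toolkit.cleanup_inventory_table()
    toolkit.inventorize_permissions()
    toolkit.apply_permissions_to_backup_groups()
    toolkit.replace_workspace_groups_with_account_groups()
    toolkit.apply_permissions_to_account_groups()
    toolkit.delete_backup_groups()
    toolkit.cleanup_inventory_table()  # optional final cleanup


toolkit = GroupMigrationToolkit()
run_group_migration(toolkit)
print(toolkit.calls[0])  # prepare_groups_in_environment
```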

## Permissions and entitlements that we inventorize

> Please note that inherited permissions will not be inventorized / migrated.
> We only cover direct permissions.

Group-level:

- [x] Entitlements (One of `workspace-access`, `databricks-sql-access`, `allow-cluster-create`, `allow-instance-pool-create`)
- [x] Roles (AWS Only, represents Instance Profile Access)

Compute infrastructure:

- [x] Clusters
- [ ] Cluster policies
- [ ] Pools
- [ ] Instance Profile (for AWS)

Workflows:

- [ ] Delta Live Tables
- [ ] Jobs

ML:

- [ ] MLflow experiments
- [ ] MLflow registry
- [ ] Legacy MLflow model endpoints (?)

SQL:

- [ ] Databricks SQL warehouses
- [ ] Dashboard
- [ ] Queries
- [ ] Alerts

Security:

- [ ] Tokens
- [ ] Passwords (for AWS)
- [ ] Secrets

Workspace:

- [ ] Notebooks in the Workspace FS
- [ ] Directories in the Workspace FS
- [ ] Files in the Workspace FS

Repos:

- [ ] User-level Repos
- [ ] Org-level Repos

## Local setup and development process
Data access:

- Install [poetry](https://python-poetry.org/)
- Run `poetry install` in the project directory
- Pin your IDE to use the newly created poetry environment
- [ ] Table ACLs

> Please note that you **don't** need to use `poetry` inside notebooks or in the Databricks workspace.
> It's only introduced to simplify local development.
## Development

Before running `git push`, don't forget to link your code with:
This section describes the setup and development process for the project.

### Local setup

- Install [hatch](https://github.com/pypa/hatch):

```shell
pip install hatch
```

- Create environment:

```shell
hatch env create
```

- Install dev dependencies:

```shell
hatch run pip install -e '.[dbconnect]'
```

- Pin your IDE to use the newly created virtual environment. You can get the python path with:

```shell
hatch run python -c "import sys; print(sys.executable)"
```

- You're good to go! 🎉

### Development process

Please note that you **don't** need to use `hatch` inside notebooks or in the Databricks workspace.
It's only introduced to simplify local development.

Write your code in the IDE. Please keep all relevant files under the `src/uc_migration_toolkit` directory.

Don't forget to test your code via:

```shell
make lint
hatch run test
```

### Details of package installation
Please note that all commits go through the CI process, which verifies linting. You can run linting locally via:

Since the package itself is managed with `poetry`, to re-use it inside the notebooks we're doing the following:
```shell
hatch run lint:fmt
```

1. Installing the package dependencies via poetry export
2. Adding the package itself to the notebook via `sys.path`
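
The two steps above (the pre-`hatch`, poetry-era mechanism) amount to a notebook cell like the following sketch; the checkout path is a hypothetical example, not a location mandated by the repo:

```python
import sys
from pathlib import Path

# Hypothetical location of the repo checkout inside the workspace;
# adjust to wherever the package source actually lives.
package_root = Path("/Workspace/Repos/me/UC-Upgrade/src")

# Prepend so the in-repo package shadows any installed copy.
if str(package_root) not in sys.path:
    sys.path.insert(0, str(package_root))

# After this, `import uc_migration_toolkit` resolves against the repo source.
```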

45 changes: 45 additions & 0 deletions dev/init_setup.py
@@ -0,0 +1,45 @@
from functools import partial
from pathlib import Path

from databricks.sdk import WorkspaceClient
from databricks.sdk.service.iam import ComplexValue
from dotenv import load_dotenv

from uc_migration_toolkit.config import RateLimitConfig
from uc_migration_toolkit.providers.logger import logger
from uc_migration_toolkit.utils import ThreadedExecution

Threader = partial(ThreadedExecution, num_threads=40, rate_limit=RateLimitConfig())


def _create_user(_ws: WorkspaceClient, uid: int):
    user_name = f"test-user-{uid}@example.com"
    potential_user = list(_ws.users.list(filter=f"userName eq '{user_name}'"))
    if potential_user:
        logger.debug(f"User {user_name} already exists, skipping its creation")
    else:
        _ws.users.create(
            active=True,
            user_name=user_name,
            display_name=f"test-user-{uid}",
            emails=[ComplexValue(display=None, primary=True, value=f"test-user-{uid}@example.com")],
        )


def _create_users(_ws: WorkspaceClient):
    executables = [partial(_create_user, _ws, uid) for uid in range(200)]
    Threader(executables).run()


if __name__ == "__main__":
    principal_env = Path(__file__).parent.parent / ".env.principal"
    if principal_env.exists():
        logger.info("Using credentials provided in .env.principal")
        load_dotenv(dotenv_path=principal_env)

    logger.debug("setting up the workspace client")
    ws = WorkspaceClient()
    user_info = ws.current_user.me()
    logger.debug("workspace client is set up")

    _create_users(ws)
14 changes: 14 additions & 0 deletions examples/migration_config.yml
@@ -0,0 +1,14 @@
inventory:
  table:
    catalog: main
    database: default
    name: uc_migration_inventory

with_table_acls: False

groups:
  selected: [ "analyst" ]

num_threads: 80
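
The example config above maps naturally onto typed config objects. A minimal sketch follows — the class and field names mirror the YAML keys but are assumptions for illustration, not the toolkit's actual config classes:

```python
from dataclasses import dataclass, field


@dataclass
class InventoryTable:
    catalog: str
    database: str
    name: str


@dataclass
class MigrationConfig:
    inventory_table: InventoryTable
    with_table_acls: bool
    selected_groups: list[str] = field(default_factory=list)
    num_threads: int = 80


# Values mirror examples/migration_config.yml above.
config = MigrationConfig(
    inventory_table=InventoryTable(catalog="main", database="default", name="uc_migration_inventory"),
    with_table_acls=False,
    selected_groups=["analyst"],
)
print(config.num_threads)  # 80
```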
