Skip to content

Commit

Permalink
Added support for mapping workspace group to account group by prefix/…
Browse files Browse the repository at this point in the history
…suffix/regex/external id (#650)

Closes #83 #144

---------

Co-authored-by: andrascsillag-db <129275651+andrascsillag-db@users.noreply.github.com>
  • Loading branch information
FastLee and andrascsillag-db authored Dec 6, 2023
1 parent 0a50d63 commit 44fd399
Show file tree
Hide file tree
Showing 8 changed files with 749 additions and 120 deletions.
42 changes: 42 additions & 0 deletions docs/group_name_conflict.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Group Name Conflict Resolution
During the UC upgrade process we migrate all local workspace groups to account-level groups.
The process is detailed here: [local-group-migration.md](local-group-migration.md)
<br/>
When migrating multiple workspaces we can run into conflicts.
These conflicts occur when groups with the same name in different workspaces have different membership and different use.

## Suggested Workflow
During the installation process we pose the following question:
<br/>
"Do you need to rename the workspace groups to match the account groups' name?"


If the answer is "Yes" a follow up question will be:
<br/>
"Choose How to rename the workspace groups:"
1. Apply a Prefix
2. Apply a Suffix
3. Use Regular Expression Substitution
4. Use Regular Expression to extract a value from the account and the workspace
5. Map using External Group ID

The user then inputs the Prefix/Suffix/Regular Expression.
The install process will validate the regular expression.
The install process will register the selection as regular expression in the configuration YAML file.

We introduce 4 more parameters to the configuration yaml and the group manager:
- workspace_group_regex
- workspace_group_replace
- account_group_regex
- group_match_by_external_id

When we run the migration process the regular expression substitution will be applied on all groups.


Group Translation Scenarios:

| Scenario | User Input | workspace_group_regex | workspace_group_replace | account_group_regex | Example |
|----------|-----------------------------------------------------------|-----------------------|-------------------------|---------------------|----------------------------------------|
| Prefix | prefix: [Prefix] | ^ | [Prefix] | [EMPTY] | data_engineers --> prod_data_engineers |
| Suffix | suffix: [Suffix] | $ | [Suffix] | [EMPTY] | data_engineers --> data_engineers_prod |
| Substitution | Search Regex: [Regex]<br/>Replace Text:[Replacement_Text] | [WS_Regex] | [Replacement_Text] | [Empty] | corp_tech_data_engineers --> prod_data_engineers |
| Partial Lookup | Workspace Regex: [WS_Regex]<br/> Account Regex: [Acct Regex] | [WS_Regex]| [Empty] | [Acct_Regex] | data_engineers(12345) --> data_engs(12345) |
10 changes: 6 additions & 4 deletions src/databricks/labs/ucx/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,12 +170,14 @@ def to_account_client(self) -> AccountClient:
@dataclass
class WorkspaceConfig(_Config["WorkspaceConfig"]):
inventory_database: str

# Group name conversion parameters.
workspace_group_regex: str = None
workspace_group_replace: str = None
account_group_regex: str = None
group_match_by_external_id: bool = False
# Includes group names for migration. If not specified, all matching groups will be picked up
include_group_names: list[str] = None

include_group_names: list[str] | None = None
renamed_group_prefix: str = "ucx-renamed-"

instance_pool_id: str = None
warehouse_id: str = None
connect: ConnectConfig | None = None
Expand Down
125 changes: 113 additions & 12 deletions src/databricks/labs/ucx/install.py
Original file line number Diff line number Diff line change
Expand Up @@ -381,18 +381,80 @@ def warehouse_type(_):
)
warehouse_id = new_warehouse.id

# Setting up group migration parameters
groups_config_args = {}

while (
self._question(
"Do you need to convert the workspace groups to match the account groups' name?"
" If the workspace groups' names match the account groups' names select "
"no"
" or hit <Enter/Return>.",
default="no",
)
== "yes"
):
logger.info("Setting up group name translation")
groups_config_args["convert_group_names"] = "yes"
choices = {
"Apply a Prefix": "prefix",
"Apply a Suffix": "suffix",
"Regex Substitution": "sub",
"Regex Matching": "match",
"Match By External ID": "external",
"Cancel": "cancel",
}
choice = self._choice_from_dict("Choose how to map the workspace groups:", choices, sort=False)
match choice:
case "cancel":
continue
case "prefix":
prefix = self._group_question(
"Enter a prefix to add to the workspace group name. Use only valid characters"
)
if not prefix:
continue
groups_config_args["workspace_group_match_regex"] = "^"
groups_config_args["workspace_group_replace"] = prefix
case "suffix":
suffix = self._group_question(
"Enter a suffix to add to the workspace group name. Use only valid characters"
)
if not suffix:
continue
groups_config_args["workspace_group_match_regex"] = "$"
groups_config_args["workspace_group_replace"] = suffix
case "sub":
match_value = self._regex_question("Enter a RegEx expression for Substitution")
if not match_value:
continue
sub_value = self._group_question("Enter the substitution value")
if not sub_value:
continue
groups_config_args["workspace_group_match_regex"] = match_value
groups_config_args["workspace_group_replace"] = sub_value
case "matching":
ws_match_value = self._regex_question("Enter a RegEx expression to match on the workspace group")
if not ws_match_value:
continue
acct_match_value = self._regex_question("Enter a RegEx expression to match on the account group")
if not acct_match_value:
continue
groups_config_args["workspace_group_match_regex"] = ws_match_value
groups_config_args["account_group_match_regex"] = acct_match_value
case "external":
groups_config_args["group_match_by_external_id"] = True
break

selected_groups = self._question(
"Comma-separated list of workspace group names to migrate. If not specified, we'll use all "
"account-level groups with matching names to workspace-level groups.",
"account-level groups with matching names to workspace-level groups",
default="<ALL>",
)
backup_group_prefix = self._question("Backup prefix", default="db-temp-")
backup_group_prefix = self._group_question("Backup prefix", default="db-temp-")
log_level = self._question("Log level", default="INFO").upper()
num_threads = int(self._question("Number of threads", default="8"))
groups_config_args = {
"backup_group_prefix": backup_group_prefix,
}

groups_config_args["backup_group_prefix"] = backup_group_prefix
if selected_groups != "<ALL>":
groups_config_args["selected"] = [x.strip() for x in selected_groups.split(",")]
else:
Expand All @@ -406,8 +468,8 @@ def warehouse_type(_):
if (
len(policies_with_external_hms) > 0
and self._question(
"We have identified one or more cluster policies set up for an external metastore. "
"Would you like to set UCX to connect to the external metastore.",
"We have identified one or more cluster policies set up for an external metastore"
"Would you like to set UCX to connect to the external metastore",
default="no",
)
== "yes"
Expand All @@ -422,6 +484,10 @@ def warehouse_type(_):

self._config = WorkspaceConfig(
inventory_database=inventory_database,
workspace_group_regex=groups_config_args.get("workspace_group_regex"),
workspace_group_replace=groups_config_args.get("workspace_group_replace"),
account_group_regex=groups_config_args.get("account_group_regex"),
group_match_by_external_id=groups_config_args.get("group_match_by_external_id"),
include_group_names=groups_config_args["selected"],
renamed_group_prefix=groups_config_args["backup_group_prefix"],
warehouse_id=warehouse_id,
Expand Down Expand Up @@ -580,14 +646,15 @@ def _create_debug(self, remote_wheel: str):
def notebook_link(self, path: str) -> str:
return f"{self._ws.config.host}/#workspace{path}"

def _choice_from_dict(self, text: str, choices: dict[str, Any]) -> Any:
key = self._choice(text, list(choices.keys()))
def _choice_from_dict(self, text: str, choices: dict[str, Any], *, sort: bool = True) -> Any:
key = self._choice(text, list(choices.keys()), sort=sort)
return choices[key]

def _choice(self, text: str, choices: list[Any], *, max_attempts: int = 10) -> str:
def _choice(self, text: str, choices: list[Any], *, max_attempts: int = 10, sort: bool = True) -> str:
if not self._prompts:
return "any"
choices = sorted(choices, key=str.casefold)
if sort:
choices = sorted(choices, key=str.casefold)
numbered = "\n".join(f"\033[1m[{i}]\033[0m \033[36m{v}\033[0m" for i, v in enumerate(choices))
prompt = f"\033[1m{text}\033[0m\n{numbered}\nEnter a number between 0 and {len(choices) - 1}: "
attempt = 0
Expand Down Expand Up @@ -617,6 +684,36 @@ def _question(text: str, *, default: str | None = None) -> str:
return default
return res

@classmethod
def _group_question(cls, text: str, *, default: str | None = None) -> str | None:
    """Prompt for a group-name fragment until a valid one is entered.

    The helper is shared by the prefix, suffix, substitution-value and
    backup-prefix prompts, so the error message does not name any one of them.

    Args:
        text: The question shown to the user.
        default: Value passed through to ``_question`` as its default.

    Returns:
        The first answer accepted by ``_is_valid_group_str``, or ``None`` once
        ``NUM_USER_ATTEMPTS`` invalid answers have been given.
    """
    attempts_left = NUM_USER_ATTEMPTS
    while attempts_left:
        group_input = cls._question(text, default=default)
        if cls._is_valid_group_str(group_input):
            return group_input
        attempts_left -= 1
        logger.error(
            f"{group_input} is an invalid value. It is empty or contains invalid characters. "
            f"Please try again ({attempts_left} more attempts)"
        )
    return None

@classmethod
def _regex_question(cls, text: str, *, default: str | None = None) -> str | None:
    """Prompt for a regular expression until one compiles.

    Args:
        text: The question shown to the user.
        default: Value passed through to ``_question`` as its default.

    Returns:
        The first answer that ``re.compile`` accepts, or ``None`` once
        ``NUM_USER_ATTEMPTS`` invalid answers have been given.
    """
    attempts_left = NUM_USER_ATTEMPTS
    while attempts_left:
        regex_input = cls._question(text, default=default)
        try:
            re.compile(regex_input)
        except (re.error, TypeError):
            # re.error: the pattern is malformed.
            # TypeError: _question can hand back `default`, which is None unless
            # the caller overrides it, and re.compile() rejects non-string input.
            # Both count as a failed attempt instead of crashing the installer.
            attempts_left -= 1
            logger.error(
                f"{regex_input} is an invalid RegEx expression. Please try again ({attempts_left} more attempts)"
            )
        else:
            return regex_input
    return None

def _upload_wheel(self) -> str:
with tempfile.TemporaryDirectory() as tmp_dir:
local_wheel = self._build_wheel(tmp_dir)
Expand Down Expand Up @@ -891,6 +988,10 @@ def _get_ext_hms_conf_from_policy(cluster_policy):
spark_conf_dict[key[11:]] = cluster_policy[key]["value"]
return instance_profile, spark_conf_dict

@staticmethod
def _is_valid_group_str(group_str: str):
return group_str and not re.search(r"[\s#,+ \\<>;]", group_str)

def latest_job_status(self) -> list[dict]:
latest_status = []
for step, job_id in self._state.jobs.items():
Expand Down
54 changes: 49 additions & 5 deletions src/databricks/labs/ucx/runtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,15 @@ def crawl_groups(cfg: WorkspaceConfig):
sql_backend = RuntimeBackend()
ws = WorkspaceClient(config=cfg.to_databricks_config())
group_manager = GroupManager(
sql_backend, ws, cfg.inventory_database, cfg.include_group_names, cfg.renamed_group_prefix
sql_backend,
ws,
cfg.inventory_database,
cfg.include_group_names,
cfg.renamed_group_prefix,
workspace_group_regex=cfg.workspace_group_regex,
workspace_group_replace=cfg.workspace_group_replace,
account_group_regex=cfg.account_group_regex,
external_id_match=cfg.group_match_by_external_id,
)
group_manager.snapshot()

Expand Down Expand Up @@ -229,7 +237,15 @@ def rename_workspace_local_groups(cfg: WorkspaceConfig):
sql_backend = RuntimeBackend()
ws = WorkspaceClient(config=cfg.to_databricks_config())
group_manager = GroupManager(
sql_backend, ws, cfg.inventory_database, cfg.include_group_names, cfg.renamed_group_prefix
sql_backend,
ws,
cfg.inventory_database,
cfg.include_group_names,
cfg.renamed_group_prefix,
workspace_group_regex=cfg.workspace_group_regex,
workspace_group_replace=cfg.workspace_group_replace,
account_group_regex=cfg.account_group_regex,
external_id_match=cfg.group_match_by_external_id,
)
group_manager.rename_groups()

Expand All @@ -241,7 +257,15 @@ def reflect_account_groups_on_workspace(cfg: WorkspaceConfig):
sql_backend = RuntimeBackend()
ws = WorkspaceClient(config=cfg.to_databricks_config())
group_manager = GroupManager(
sql_backend, ws, cfg.inventory_database, cfg.include_group_names, cfg.renamed_group_prefix
sql_backend,
ws,
cfg.inventory_database,
cfg.include_group_names,
cfg.renamed_group_prefix,
workspace_group_regex=cfg.workspace_group_regex,
workspace_group_replace=cfg.workspace_group_replace,
account_group_regex=cfg.account_group_regex,
external_id_match=cfg.group_match_by_external_id,
)
group_manager.reflect_account_groups_on_workspace()

Expand All @@ -259,7 +283,17 @@ def apply_permissions_to_account_groups(cfg: WorkspaceConfig):
See [interactive tutorial here](https://app.getreprise.com/launch/myM3VNn/)."""
backend = RuntimeBackend()
ws = WorkspaceClient(config=cfg.to_databricks_config())
group_manager = GroupManager(backend, ws, cfg.inventory_database, cfg.include_group_names, cfg.renamed_group_prefix)
group_manager = GroupManager(
backend,
ws,
cfg.inventory_database,
cfg.include_group_names,
cfg.renamed_group_prefix,
workspace_group_regex=cfg.workspace_group_regex,
workspace_group_replace=cfg.workspace_group_replace,
account_group_regex=cfg.account_group_regex,
external_id_match=cfg.group_match_by_external_id,
)

migration_state = group_manager.get_migration_state()
if len(migration_state.groups) == 0:
Expand All @@ -283,7 +317,17 @@ def delete_backup_groups(cfg: WorkspaceConfig):
successfully for all the groups involved."""
backend = RuntimeBackend()
ws = WorkspaceClient(config=cfg.to_databricks_config())
group_manager = GroupManager(backend, ws, cfg.inventory_database, cfg.include_group_names, cfg.renamed_group_prefix)
group_manager = GroupManager(
backend,
ws,
cfg.inventory_database,
cfg.include_group_names,
cfg.renamed_group_prefix,
workspace_group_regex=cfg.workspace_group_regex,
workspace_group_replace=cfg.workspace_group_replace,
account_group_regex=cfg.account_group_regex,
external_id_match=cfg.group_match_by_external_id,
)
group_manager.delete_original_workspace_groups()


Expand Down
Loading

0 comments on commit 44fd399

Please sign in to comment.